summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c6
-rw-r--r--fs/autofs/dev-ioctl.c5
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/block_dev.c755
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/backref.c19
-rw-r--r--fs/btrfs/block-group.c268
-rw-r--r--fs/btrfs/block-group.h2
-rw-r--r--fs/btrfs/block-rsv.c8
-rw-r--r--fs/btrfs/btrfs_inode.h23
-rw-r--r--fs/btrfs/check-integrity.c11
-rw-r--r--fs/btrfs/compression.c28
-rw-r--r--fs/btrfs/ctree.c258
-rw-r--r--fs/btrfs/ctree.h214
-rw-r--r--fs/btrfs/delayed-inode.c23
-rw-r--r--fs/btrfs/delayed-inode.h3
-rw-r--r--fs/btrfs/dev-replace.c20
-rw-r--r--fs/btrfs/dir-item.c1
-rw-r--r--fs/btrfs/discard.c46
-rw-r--r--fs/btrfs/discard.h3
-rw-r--r--fs/btrfs/disk-io.c689
-rw-r--r--fs/btrfs/disk-io.h25
-rw-r--r--fs/btrfs/export.c1
-rw-r--r--fs/btrfs/extent-io-tree.h71
-rw-r--r--fs/btrfs/extent-tree.c111
-rw-r--r--fs/btrfs/extent_io.c656
-rw-r--r--fs/btrfs/extent_io.h50
-rw-r--r--fs/btrfs/file-item.c344
-rw-r--r--fs/btrfs/file.c737
-rw-r--r--fs/btrfs/free-space-cache.c558
-rw-r--r--fs/btrfs/free-space-cache.h22
-rw-r--r--fs/btrfs/free-space-tree.c26
-rw-r--r--fs/btrfs/inode-item.c6
-rw-r--r--fs/btrfs/inode-map.c582
-rw-r--r--fs/btrfs/inode-map.h16
-rw-r--r--fs/btrfs/inode.c815
-rw-r--r--fs/btrfs/ioctl.c64
-rw-r--r--fs/btrfs/locking.c459
-rw-r--r--fs/btrfs/locking.h24
-rw-r--r--fs/btrfs/ordered-data.c45
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/print-tree.c15
-rw-r--r--fs/btrfs/qgroup.c52
-rw-r--r--fs/btrfs/raid56.c8
-rw-r--r--fs/btrfs/reada.c34
-rw-r--r--fs/btrfs/ref-verify.c27
-rw-r--r--fs/btrfs/reflink.c18
-rw-r--r--fs/btrfs/relocation.c116
-rw-r--r--fs/btrfs/scrub.c340
-rw-r--r--fs/btrfs/send.c6
-rw-r--r--fs/btrfs/struct-funcs.c18
-rw-r--r--fs/btrfs/super.c179
-rw-r--r--fs/btrfs/sysfs.c132
-rw-r--r--fs/btrfs/tests/btrfs-tests.c3
-rw-r--r--fs/btrfs/tests/extent-io-tests.c26
-rw-r--r--fs/btrfs/tests/free-space-tests.c1
-rw-r--r--fs/btrfs/tests/qgroup-tests.c4
-rw-r--r--fs/btrfs/transaction.c126
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/tree-checker.c337
-rw-r--r--fs/btrfs/tree-defrag.c1
-rw-r--r--fs/btrfs/tree-log.c183
-rw-r--r--fs/btrfs/uuid-tree.c3
-rw-r--r--fs/btrfs/volumes.c156
-rw-r--r--fs/btrfs/volumes.h21
-rw-r--r--fs/btrfs/xattr.c8
-rw-r--r--fs/btrfs/zoned.c616
-rw-r--r--fs/btrfs/zoned.h160
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/compat_binfmt_elf.c20
-rw-r--r--fs/coredump.c6
-rw-r--r--fs/crypto/fname.c14
-rw-r--r--fs/crypto/fscrypt_private.h57
-rw-r--r--fs/crypto/hkdf.c2
-rw-r--r--fs/crypto/hooks.c56
-rw-r--r--fs/crypto/keyring.c10
-rw-r--r--fs/crypto/keysetup.c44
-rw-r--r--fs/crypto/policy.c27
-rw-r--r--fs/dax.c9
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/lowcomms.c304
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/member.c2
-rw-r--r--fs/dlm/rcom.c6
-rw-r--r--fs/erofs/Makefile5
-rw-r--r--fs/erofs/compress.h54
-rw-r--r--fs/erofs/data.c26
-rw-r--r--fs/erofs/decompressor.c2
-rw-r--r--fs/erofs/zdata.c172
-rw-r--r--fs/erofs/zdata.h1
-rw-r--r--fs/eventpoll.c723
-rw-r--r--fs/exec.c58
-rw-r--r--fs/exportfs/expfs.c32
-rw-r--r--fs/ext2/dir.c14
-rw-r--r--fs/ext2/ext2.h7
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext2/namei.c15
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/dir.c23
-rw-r--r--fs/ext4/ext4.h4
-rw-r--r--fs/ext4/inode-test.c320
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/namei.c14
-rw-r--r--fs/ext4/super.c23
-rw-r--r--fs/ext4/sysfs.c10
-rw-r--r--fs/f2fs/acl.c2
-rw-r--r--fs/f2fs/checkpoint.c38
-rw-r--r--fs/f2fs/compress.c26
-rw-r--r--fs/f2fs/compress.h0
-rw-r--r--fs/f2fs/data.c211
-rw-r--r--fs/f2fs/debug.c11
-rw-r--r--fs/f2fs/dir.c115
-rw-r--r--fs/f2fs/f2fs.h183
-rw-r--r--fs/f2fs/file.c436
-rw-r--r--fs/f2fs/gc.c4
-rw-r--r--fs/f2fs/hash.c11
-rw-r--r--fs/f2fs/inline.c11
-rw-r--r--fs/f2fs/inode.c3
-rw-r--r--fs/f2fs/namei.c1
-rw-r--r--fs/f2fs/node.c41
-rw-r--r--fs/f2fs/node.h4
-rw-r--r--fs/f2fs/recovery.c16
-rw-r--r--fs/f2fs/segment.c51
-rw-r--r--fs/f2fs/shrinker.c4
-rw-r--r--fs/f2fs/super.c72
-rw-r--r--fs/f2fs/sysfs.c14
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/file.c168
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/fs-writeback.c4
-rw-r--r--fs/fuse/acl.c6
-rw-r--r--fs/fuse/dir.c60
-rw-r--r--fs/fuse/file.c41
-rw-r--r--fs/fuse/fuse_i.h41
-rw-r--r--fs/fuse/inode.c61
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/fuse/virtio_fs.c47
-rw-r--r--fs/fuse/xattr.c9
-rw-r--r--fs/inode.c3
-rw-r--r--fs/internal.h9
-rw-r--r--fs/io-wq.c10
-rw-r--r--fs/io-wq.h1
-rw-r--r--fs/io_uring.c1385
-rw-r--r--fs/iomap/buffered-io.c2
-rw-r--r--fs/jfs/jfs_dmap.c10
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_extent.h2
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c2
-rw-r--r--fs/kernfs/dir.c5
-rw-r--r--fs/libfs.c70
-rw-r--r--fs/locks.c18
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c40
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/dev.c2
-rw-r--r--fs/nfs/callback_proc.c5
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/export.c3
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/nfs42xdr.c2
-rw-r--r--fs/nfs/nfs4xdr.c6
-rw-r--r--fs/nfs_common/grace.c6
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/filecache.c3
-rw-r--r--fs/nfsd/nfs2acl.c21
-rw-r--r--fs/nfsd/nfs3acl.c8
-rw-r--r--fs/nfsd/nfs3proc.c11
-rw-r--r--fs/nfsd/nfs3xdr.c40
-rw-r--r--fs/nfsd/nfs4proc.c35
-rw-r--r--fs/nfsd/nfs4state.c3
-rw-r--r--fs/nfsd/nfs4xdr.c2551
-rw-r--r--fs/nfsd/nfsd.h9
-rw-r--r--fs/nfsd/nfsfh.c34
-rw-r--r--fs/nfsd/nfsfh.h24
-rw-r--r--fs/nfsd/nfsproc.c25
-rw-r--r--fs/nfsd/nfssvc.c50
-rw-r--r--fs/nfsd/nfsxdr.c16
-rw-r--r--fs/nfsd/trace.c1
-rw-r--r--fs/nfsd/trace.h176
-rw-r--r--fs/nfsd/vfs.c29
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nfsd/xdr3.h2
-rw-r--r--fs/nfsd/xdr4.h43
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/notify/dnotify/dnotify.c4
-rw-r--r--fs/notify/fanotify/fanotify.c7
-rw-r--r--fs/notify/fsnotify.c107
-rw-r--r--fs/notify/inotify/inotify.h9
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c51
-rw-r--r--fs/notify/inotify/inotify_user.c31
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/logfile.c3
-rw-r--r--fs/ocfs2/cluster/tcp.c1
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/open.c6
-rw-r--r--fs/overlayfs/copy_up.c28
-rw-r--r--fs/overlayfs/export.c10
-rw-r--r--fs/overlayfs/file.c144
-rw-r--r--fs/overlayfs/inode.c14
-rw-r--r--fs/overlayfs/namei.c28
-rw-r--r--fs/overlayfs/overlayfs.h22
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/super.c95
-rw-r--r--fs/overlayfs/util.c18
-rw-r--r--fs/pipe.c5
-rw-r--r--fs/proc/array.c38
-rw-r--r--fs/proc/base.c18
-rw-r--r--fs/proc/fd.c48
-rw-r--r--fs/proc/generic.c24
-rw-r--r--fs/proc/internal.h10
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/proc_net.c20
-rw-r--r--fs/proc/stat.c3
-rw-r--r--fs/pstore/Kconfig8
-rw-r--r--fs/pstore/blk.c85
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/pstore/internal.h1
-rw-r--r--fs/pstore/platform.c2
-rw-r--r--fs/pstore/zone.c4
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/quota/quota.c40
-rw-r--r--fs/quota/quota_tree.c8
-rw-r--r--fs/quota/quota_v2.c19
-rw-r--r--fs/reiserfs/stree.c6
-rw-r--r--fs/remap_range.c10
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c93
-rw-r--r--fs/ubifs/auth.c1
-rw-r--r--fs/ubifs/dir.c29
-rw-r--r--fs/userfaultfd.c20
-rw-r--r--fs/verity/enable.c8
-rw-r--r--fs/verity/fsverity_private.h38
-rw-r--r--fs/verity/hash_algs.c2
-rw-r--r--fs/verity/init.c2
-rw-r--r--fs/verity/measure.c12
-rw-r--r--fs/verity/open.c24
-rw-r--r--fs/verity/signature.c14
-rw-r--r--fs/verity/verify.c2
-rw-r--r--fs/xattr.c17
-rw-r--r--fs/xfs/xfs_fsops.c7
250 files changed, 10148 insertions, 9155 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 6a21d8919409..1f32da13d39e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -43,7 +43,6 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
-#include <asm/kmap_types.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
@@ -324,13 +323,16 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mremap(struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags)
{
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
+ if (flags & MREMAP_DONTUNMAP)
+ return -EINVAL;
+
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 322b7dfb4ea0..5bf781ea6d67 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -4,9 +4,10 @@
* Copyright 2008 Ian Kent <raven@themaw.net>
*/
+#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/compat.h>
-#include <linux/syscalls.h>
+#include <linux/fdtable.h>
#include <linux/magic.h>
#include <linux/nospec.h>
@@ -289,7 +290,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- return ksys_close(param->ioctlfd);
+ return close_fd(param->ioctlfd);
}
/*
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 3ac7611ef7ce..fd691e4815c5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -350,7 +350,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
if (info->si_lasti == BFS_MAX_LASTI)
- printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+ printf("NOTE: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
else if (info->si_lasti > BFS_MAX_LASTI) {
printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
goto out1;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fa50e8936f5f..950bc177238a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1246,7 +1246,7 @@ out_free_interp:
set_binfmt(&elf_format);
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
- retval = arch_setup_additional_pages(bprm, !!interpreter);
+ retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
if (retval < 0)
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
@@ -1307,7 +1307,7 @@ out_free_interp:
#endif
finalize_exec(bprm);
- start_thread(regs, elf_entry, bprm->p);
+ START_THREAD(elf_ex, regs, elf_entry, bprm->p);
retval = 0;
out:
return retval;
@@ -2198,6 +2198,7 @@ static int elf_core_dump(struct coredump_params *cprm)
{
size_t sz = get_note_info_size(&info);
+ /* For cell spufs */
sz += elf_coredump_extra_notes_size();
phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
@@ -2261,6 +2262,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!write_note_info(&info, cprm))
goto end_coredump;
+ /* For cell spufs */
if (elf_coredump_extra_notes_write(cprm))
goto end_coredump;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9e84b1928b94..9e56ee1f2652 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -32,6 +32,7 @@
#include <linux/cleancache.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
+#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include "internal.h"
@@ -110,24 +111,20 @@ EXPORT_SYMBOL(invalidate_bdev);
int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
loff_t lstart, loff_t lend)
{
- struct block_device *claimed_bdev = NULL;
- int err;
-
/*
* If we don't hold exclusive handle for the device, upgrade to it
* while we discard the buffer cache to avoid discarding buffers
* under live filesystem.
*/
if (!(mode & FMODE_EXCL)) {
- claimed_bdev = bdev->bd_contains;
- err = bd_prepare_to_claim(bdev, claimed_bdev,
- truncate_bdev_range);
+ int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
if (err)
return err;
}
+
truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
- if (claimed_bdev)
- bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+ if (!(mode & FMODE_EXCL))
+ bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
}
EXPORT_SYMBOL(truncate_bdev_range);
@@ -548,55 +545,47 @@ EXPORT_SYMBOL(fsync_bdev);
* count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
* actually.
*/
-struct super_block *freeze_bdev(struct block_device *bdev)
+int freeze_bdev(struct block_device *bdev)
{
struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1) {
- /*
- * We don't even need to grab a reference - the first call
- * to freeze_bdev grab an active reference and only the last
- * thaw_bdev drops it.
- */
- sb = get_super(bdev);
- if (sb)
- drop_super(sb);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return sb;
- }
+ if (++bdev->bd_fsfreeze_count > 1)
+ goto done;
sb = get_active_super(bdev);
if (!sb)
- goto out;
+ goto sync;
if (sb->s_op->freeze_super)
error = sb->s_op->freeze_super(sb);
else
error = freeze_super(sb);
+ deactivate_super(sb);
+
if (error) {
- deactivate_super(sb);
bdev->bd_fsfreeze_count--;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return ERR_PTR(error);
+ goto done;
}
- deactivate_super(sb);
- out:
+ bdev->bd_fsfreeze_sb = sb;
+
+sync:
sync_blockdev(bdev);
+done:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return sb; /* thaw_bdev releases s->s_umount */
+ return error;
}
EXPORT_SYMBOL(freeze_bdev);
/**
* thaw_bdev -- unlock filesystem
* @bdev: blockdevice to unlock
- * @sb: associated superblock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
*/
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev)
{
+ struct super_block *sb;
int error = -EINVAL;
mutex_lock(&bdev->bd_fsfreeze_mutex);
@@ -607,6 +596,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
if (--bdev->bd_fsfreeze_count > 0)
goto out;
+ sb = bdev->bd_fsfreeze_sb;
if (!sb)
goto out;
@@ -792,23 +782,19 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
static void bdev_free_inode(struct inode *inode)
{
+ struct block_device *bdev = I_BDEV(inode);
+
+ free_percpu(bdev->bd_stats);
+ kfree(bdev->bd_meta_info);
+
kmem_cache_free(bdev_cachep, BDEV_I(inode));
}
-static void init_once(void *foo)
+static void init_once(void *data)
{
- struct bdev_inode *ei = (struct bdev_inode *) foo;
- struct block_device *bdev = &ei->bdev;
+ struct bdev_inode *ei = data;
- memset(bdev, 0, sizeof(*bdev));
- mutex_init(&bdev->bd_mutex);
-#ifdef CONFIG_SYSFS
- INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif
- bdev->bd_bdi = &noop_backing_dev_info;
inode_init_once(&ei->vfs_inode);
- /* Initialize mutex for freeze. */
- mutex_init(&bdev->bd_fsfreeze_mutex);
}
static void bdev_evict_inode(struct inode *inode)
@@ -870,72 +856,72 @@ void __init bdev_cache_init(void)
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
}
-/*
- * Most likely _very_ bad one - but then it's hardly critical for small
- * /dev and can be fixed when somebody will need really large one.
- * Keep in mind that it will be fed through icache hash function too.
- */
-static inline unsigned long hash(dev_t dev)
+struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
- return MAJOR(dev)+MINOR(dev);
-}
+ struct block_device *bdev;
+ struct inode *inode;
-static int bdev_test(struct inode *inode, void *data)
-{
- return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
+ inode = new_inode(blockdev_superblock);
+ if (!inode)
+ return NULL;
+ inode->i_mode = S_IFBLK;
+ inode->i_rdev = 0;
+ inode->i_data.a_ops = &def_blk_aops;
+ mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+
+ bdev = I_BDEV(inode);
+ memset(bdev, 0, sizeof(*bdev));
+ mutex_init(&bdev->bd_mutex);
+ mutex_init(&bdev->bd_fsfreeze_mutex);
+ spin_lock_init(&bdev->bd_size_lock);
+ bdev->bd_disk = disk;
+ bdev->bd_partno = partno;
+ bdev->bd_inode = inode;
+ bdev->bd_bdi = &noop_backing_dev_info;
+#ifdef CONFIG_SYSFS
+ INIT_LIST_HEAD(&bdev->bd_holder_disks);
+#endif
+ bdev->bd_stats = alloc_percpu(struct disk_stats);
+ if (!bdev->bd_stats) {
+ iput(inode);
+ return NULL;
+ }
+ return bdev;
}
-static int bdev_set(struct inode *inode, void *data)
+void bdev_add(struct block_device *bdev, dev_t dev)
{
- BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
- return 0;
+ bdev->bd_dev = dev;
+ bdev->bd_inode->i_rdev = dev;
+ bdev->bd_inode->i_ino = dev;
+ insert_inode_hash(bdev->bd_inode);
}
static struct block_device *bdget(dev_t dev)
{
- struct block_device *bdev;
struct inode *inode;
- inode = iget5_locked(blockdev_superblock, hash(dev),
- bdev_test, bdev_set, &dev);
-
+ inode = ilookup(blockdev_superblock, dev);
if (!inode)
return NULL;
-
- bdev = &BDEV_I(inode)->bdev;
-
- if (inode->i_state & I_NEW) {
- spin_lock_init(&bdev->bd_size_lock);
- bdev->bd_contains = NULL;
- bdev->bd_super = NULL;
- bdev->bd_inode = inode;
- bdev->bd_part_count = 0;
- inode->i_mode = S_IFBLK;
- inode->i_rdev = dev;
- inode->i_bdev = bdev;
- inode->i_data.a_ops = &def_blk_aops;
- mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- unlock_new_inode(inode);
- }
- return bdev;
+ return &BDEV_I(inode)->bdev;
}
/**
* bdgrab -- Grab a reference to an already referenced block device
* @bdev: Block device to grab a reference to.
+ *
+ * Returns the block_device with an additional reference when successful,
+ * or NULL if the inode is already beeing freed.
*/
struct block_device *bdgrab(struct block_device *bdev)
{
- ihold(bdev->bd_inode);
+ if (!igrab(bdev->bd_inode))
+ return NULL;
return bdev;
}
EXPORT_SYMBOL(bdgrab);
-struct block_device *bdget_part(struct hd_struct *part)
-{
- return bdget(part_devt(part));
-}
-
long nr_blockdev_pages(void)
{
struct inode *inode;
@@ -953,67 +939,8 @@ void bdput(struct block_device *bdev)
{
iput(bdev->bd_inode);
}
-
EXPORT_SYMBOL(bdput);
-static struct block_device *bd_acquire(struct inode *inode)
-{
- struct block_device *bdev;
-
- spin_lock(&bdev_lock);
- bdev = inode->i_bdev;
- if (bdev && !inode_unhashed(bdev->bd_inode)) {
- bdgrab(bdev);
- spin_unlock(&bdev_lock);
- return bdev;
- }
- spin_unlock(&bdev_lock);
-
- /*
- * i_bdev references block device inode that was already shut down
- * (corresponding device got removed). Remove the reference and look
- * up block device inode again just in case new device got
- * reestablished under the same device number.
- */
- if (bdev)
- bd_forget(inode);
-
- bdev = bdget(inode->i_rdev);
- if (bdev) {
- spin_lock(&bdev_lock);
- if (!inode->i_bdev) {
- /*
- * We take an additional reference to bd_inode,
- * and it's released in clear_inode() of inode.
- * So, we can access it via ->i_mapping always
- * without igrab().
- */
- bdgrab(bdev);
- inode->i_bdev = bdev;
- inode->i_mapping = bdev->bd_inode->i_mapping;
- }
- spin_unlock(&bdev_lock);
- }
- return bdev;
-}
-
-/* Call when you free inode */
-
-void bd_forget(struct inode *inode)
-{
- struct block_device *bdev = NULL;
-
- spin_lock(&bdev_lock);
- if (!sb_is_blkdev_sb(inode->i_sb))
- bdev = inode->i_bdev;
- inode->i_bdev = NULL;
- inode->i_mapping = &inode->i_data;
- spin_unlock(&bdev_lock);
-
- if (bdev)
- bdput(bdev);
-}
-
/**
* bd_may_claim - test whether a block device can be claimed
* @bdev: block device of interest
@@ -1049,7 +976,6 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
/**
* bd_prepare_to_claim - claim a block device
* @bdev: block device of interest
- * @whole: the whole device containing @bdev, may equal @bdev
* @holder: holder trying to claim @bdev
*
* Claim @bdev. This function fails if @bdev is already claimed by another
@@ -1059,9 +985,12 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
* RETURNS:
* 0 if @bdev can be claimed, -EBUSY otherwise.
*/
-int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
- void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, void *holder)
{
+ struct block_device *whole = bdev_whole(bdev);
+
+ if (WARN_ON_ONCE(!holder))
+ return -EINVAL;
retry:
spin_lock(&bdev_lock);
/* if someone else claimed, fail */
@@ -1089,27 +1018,6 @@ retry:
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
-static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
-{
- struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
-
- if (!disk)
- return NULL;
- /*
- * Now that we hold gendisk reference we make sure bdev we looked up is
- * not stale. If it is, it means device got removed and created before
- * we looked up gendisk and we fail open in such case. Associating
- * unhashed bdev with newly created gendisk could lead to two bdevs
- * (and thus two independent caches) being associated with one device
- * which is bad.
- */
- if (inode_unhashed(bdev->bd_inode)) {
- put_disk_and_module(disk);
- return NULL;
- }
- return disk;
-}
-
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
lockdep_assert_held(&bdev_lock);
@@ -1122,15 +1030,15 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
/**
* bd_finish_claiming - finish claiming of a block device
* @bdev: block device of interest
- * @whole: whole block device
* @holder: holder that has claimed @bdev
*
* Finish exclusive open of a block device. Mark the device as exlusively
* open by the holder and wake up all waiters for exclusive open to finish.
*/
-static void bd_finish_claiming(struct block_device *bdev,
- struct block_device *whole, void *holder)
+static void bd_finish_claiming(struct block_device *bdev, void *holder)
{
+ struct block_device *whole = bdev_whole(bdev);
+
spin_lock(&bdev_lock);
BUG_ON(!bd_may_claim(bdev, whole, holder));
/*
@@ -1155,11 +1063,10 @@ static void bd_finish_claiming(struct block_device *bdev,
* also used when exclusive open is not actually desired and we just needed
* to block other exclusive openers for a while.
*/
-void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
- void *holder)
+void bd_abort_claiming(struct block_device *bdev, void *holder)
{
spin_lock(&bdev_lock);
- bd_clear_claiming(whole, holder);
+ bd_clear_claiming(bdev_whole(bdev), holder);
spin_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);
@@ -1230,7 +1137,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
WARN_ON_ONCE(!bdev->bd_holder);
/* FIXME: remove the following once add_disk() handles errors */
- if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+ if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir))
goto out_unlock;
holder = bd_find_holder_disk(bdev, disk);
@@ -1249,24 +1156,24 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
holder->disk = disk;
holder->refcnt = 1;
- ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+ ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
if (ret)
goto out_free;
- ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+ ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
if (ret)
goto out_del;
/*
* bdev could be deleted beneath us which would implicitly destroy
* the holder directory. Hold on to it.
*/
- kobject_get(bdev->bd_part->holder_dir);
+ kobject_get(bdev->bd_holder_dir);
list_add(&holder->list, &bdev->bd_holder_disks);
goto out_unlock;
out_del:
- del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
out_free:
kfree(holder);
out_unlock:
@@ -1294,10 +1201,9 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
- del_symlink(bdev->bd_part->holder_dir,
- &disk_to_dev(disk)->kobj);
- kobject_put(bdev->bd_part->holder_dir);
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+ kobject_put(bdev->bd_holder_dir);
list_del_init(&holder->list);
kfree(holder);
}
@@ -1307,77 +1213,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif
-/**
- * check_disk_size_change - checks for disk size change and adjusts bdev size.
- * @disk: struct gendisk to check
- * @bdev: struct bdev to adjust.
- * @verbose: if %true log a message about a size change if there is any
- *
- * This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs. When shrinking the bdev size, its all caches
- * are freed.
- */
-static void check_disk_size_change(struct gendisk *disk,
- struct block_device *bdev, bool verbose)
-{
- loff_t disk_size, bdev_size;
-
- spin_lock(&bdev->bd_size_lock);
- disk_size = (loff_t)get_capacity(disk) << 9;
- bdev_size = i_size_read(bdev->bd_inode);
- if (disk_size != bdev_size) {
- if (verbose) {
- printk(KERN_INFO
- "%s: detected capacity change from %lld to %lld\n",
- disk->disk_name, bdev_size, disk_size);
- }
- i_size_write(bdev->bd_inode, disk_size);
- }
- spin_unlock(&bdev->bd_size_lock);
-
- if (bdev_size > disk_size) {
- if (__invalidate_device(bdev, false))
- pr_warn("VFS: busy inodes on resized disk %s\n",
- disk->disk_name);
- }
-}
-
-/**
- * revalidate_disk_size - checks for disk size change and adjusts bdev size.
- * @disk: struct gendisk to check
- * @verbose: if %true log a message about a size change if there is any
- *
- * This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs. When shrinking the bdev size, its all caches
- * are freed.
- */
-void revalidate_disk_size(struct gendisk *disk, bool verbose)
-{
- struct block_device *bdev;
-
- /*
- * Hidden disks don't have associated bdev so there's no point in
- * revalidating them.
- */
- if (disk->flags & GENHD_FL_HIDDEN)
- return;
-
- bdev = bdget_disk(disk, 0);
- if (bdev) {
- check_disk_size_change(disk, bdev, verbose);
- bdput(bdev);
- }
-}
-EXPORT_SYMBOL(revalidate_disk_size);
-
-void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
-{
- spin_lock(&bdev->bd_size_lock);
- i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
- spin_unlock(&bdev->bd_size_lock);
-}
-EXPORT_SYMBOL(bd_set_nr_sectors);
-
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
int bdev_disk_changed(struct block_device *bdev, bool invalidate)
@@ -1411,8 +1246,6 @@ rescan:
disk->fops->revalidate_disk(disk);
}
- check_disk_size_change(disk, bdev, !invalidate);
-
if (get_capacity(disk)) {
ret = blk_add_partitions(disk, bdev);
if (ret == -EAGAIN)
@@ -1439,71 +1272,19 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
* mutex_lock(part->bd_mutex)
* mutex_lock_nested(whole->bd_mutex, 1)
*/
-
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
- int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode)
{
- struct block_device *whole = NULL, *claiming = NULL;
- struct gendisk *disk;
- int ret;
- int partno;
- bool first_open = false, unblock_events = true, need_restart;
-
- restart:
- need_restart = false;
- ret = -ENXIO;
- disk = bdev_get_gendisk(bdev, &partno);
- if (!disk)
- goto out;
-
- if (partno) {
- whole = bdget_disk(disk, 0);
- if (!whole) {
- ret = -ENOMEM;
- goto out_put_disk;
- }
- }
-
- if (!for_part && (mode & FMODE_EXCL)) {
- WARN_ON_ONCE(!holder);
- if (whole)
- claiming = whole;
- else
- claiming = bdev;
- ret = bd_prepare_to_claim(bdev, claiming, holder);
- if (ret)
- goto out_put_whole;
- }
+ struct gendisk *disk = bdev->bd_disk;
+ int ret = 0;
- disk_block_events(disk);
- mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
- first_open = true;
- bdev->bd_disk = disk;
- bdev->bd_contains = bdev;
- bdev->bd_partno = partno;
-
- if (!partno) {
- ret = -ENXIO;
- bdev->bd_part = disk_get_part(disk, partno);
- if (!bdev->bd_part)
- goto out_clear;
-
+ if (!bdev_is_partition(bdev)) {
ret = 0;
- if (disk->fops->open) {
+ if (disk->fops->open)
ret = disk->fops->open(bdev, mode);
- /*
- * If we lost a race with 'disk' being deleted,
- * try again. See md.c
- */
- if (ret == -ERESTARTSYS)
- need_restart = true;
- }
- if (!ret) {
- bd_set_nr_sectors(bdev, get_capacity(disk));
+ if (!ret)
set_init_blocksize(bdev);
- }
/*
* If the device is invalidated, rescan partition
@@ -1516,28 +1297,33 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
- goto out_clear;
+ return ret;
} else {
- BUG_ON(for_part);
- ret = __blkdev_get(whole, mode, NULL, 1);
- if (ret)
- goto out_clear;
- bdev->bd_contains = bdgrab(whole);
- bdev->bd_part = disk_get_part(disk, partno);
+ struct block_device *whole = bdgrab(disk->part0);
+
+ mutex_lock_nested(&whole->bd_mutex, 1);
+ ret = __blkdev_get(whole, mode);
+ if (ret) {
+ mutex_unlock(&whole->bd_mutex);
+ bdput(whole);
+ return ret;
+ }
+ whole->bd_part_count++;
+ mutex_unlock(&whole->bd_mutex);
+
if (!(disk->flags & GENHD_FL_UP) ||
- !bdev->bd_part || !bdev->bd_part->nr_sects) {
- ret = -ENXIO;
- goto out_clear;
+ !bdev_nr_sectors(bdev)) {
+ __blkdev_put(whole, mode, 1);
+ bdput(whole);
+ return -ENXIO;
}
- bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
set_init_blocksize(bdev);
}
if (bdev->bd_bdi == &noop_backing_dev_info)
bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
- if (bdev->bd_contains == bdev) {
- ret = 0;
+ if (!bdev_is_partition(bdev)) {
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
@@ -1545,101 +1331,145 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
(!ret || ret == -ENOMEDIUM))
bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
- goto out_unlock_bdev;
+ return ret;
}
}
bdev->bd_openers++;
- if (for_part)
- bdev->bd_part_count++;
- if (claiming)
- bd_finish_claiming(bdev, claiming, holder);
+ return 0;
+}
- /*
- * Block event polling for write claims if requested. Any write holder
- * makes the write_holder state stick until all are released. This is
- * good enough and tracking individual writeable reference is too
- * fragile given the way @mode is used in blkdev_get/put().
- */
- if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
- (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
- bdev->bd_write_holder = true;
- unblock_events = false;
- }
- mutex_unlock(&bdev->bd_mutex);
+struct block_device *blkdev_get_no_open(dev_t dev)
+{
+ struct block_device *bdev;
+ struct gendisk *disk;
- if (unblock_events)
- disk_unblock_events(disk);
+ down_read(&bdev_lookup_sem);
+ bdev = bdget(dev);
+ if (!bdev) {
+ up_read(&bdev_lookup_sem);
+ blk_request_module(dev);
+ down_read(&bdev_lookup_sem);
+
+ bdev = bdget(dev);
+ if (!bdev)
+ goto unlock;
+ }
- /* only one opener holds refs to the module and disk */
- if (!first_open)
- put_disk_and_module(disk);
- if (whole)
- bdput(whole);
- return 0;
+ disk = bdev->bd_disk;
+ if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
+ goto bdput;
+ if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
+ goto put_disk;
+ if (!try_module_get(bdev->bd_disk->fops->owner))
+ goto put_disk;
+ up_read(&bdev_lookup_sem);
+ return bdev;
+put_disk:
+ put_disk(disk);
+bdput:
+ bdput(bdev);
+unlock:
+ up_read(&bdev_lookup_sem);
+ return NULL;
+}
- out_clear:
- disk_put_part(bdev->bd_part);
- bdev->bd_disk = NULL;
- bdev->bd_part = NULL;
- if (bdev != bdev->bd_contains)
- __blkdev_put(bdev->bd_contains, mode, 1);
- bdev->bd_contains = NULL;
- out_unlock_bdev:
- if (claiming)
- bd_abort_claiming(bdev, claiming, holder);
- mutex_unlock(&bdev->bd_mutex);
- disk_unblock_events(disk);
- out_put_whole:
- if (whole)
- bdput(whole);
- out_put_disk:
- put_disk_and_module(disk);
- if (need_restart)
- goto restart;
- out:
- return ret;
+void blkdev_put_no_open(struct block_device *bdev)
+{
+ module_put(bdev->bd_disk->fops->owner);
+ put_disk(bdev->bd_disk);
+ bdput(bdev);
}
/**
- * blkdev_get - open a block device
- * @bdev: block_device to open
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
* @mode: FMODE_* mask
* @holder: exclusive holder identifier
*
- * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
- * open with exclusive access. Specifying %FMODE_EXCL with %NULL
- * @holder is invalid. Exclusive opens may nest for the same @holder.
+ * Open the block device described by device number @dev. If @mode includes
+ * %FMODE_EXCL, the block device is opened with exclusive access. Specifying
+ * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
+ * the same @holder.
*
- * On success, the reference count of @bdev is unchanged. On failure,
- * @bdev is put.
+ * Use this interface ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a device
+ * number. Everything else should use blkdev_get_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * 0 on success, -errno on failure.
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
*/
-static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
{
- int ret, perm = 0;
+ bool unblock_events = true;
+ struct block_device *bdev;
+ struct gendisk *disk;
+ int ret;
- if (mode & FMODE_READ)
- perm |= MAY_READ;
- if (mode & FMODE_WRITE)
- perm |= MAY_WRITE;
- ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
+ MAJOR(dev), MINOR(dev),
+ ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
+ ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
- goto bdput;
+ return ERR_PTR(ret);
+
+ /*
+ * If we lost a race with 'disk' being deleted, try again. See md.c.
+ */
+retry:
+ bdev = blkdev_get_no_open(dev);
+ if (!bdev)
+ return ERR_PTR(-ENXIO);
+ disk = bdev->bd_disk;
+
+ if (mode & FMODE_EXCL) {
+ ret = bd_prepare_to_claim(bdev, holder);
+ if (ret)
+ goto put_blkdev;
+ }
+
+ disk_block_events(disk);
- ret =__blkdev_get(bdev, mode, holder, 0);
+ mutex_lock(&bdev->bd_mutex);
+ ret =__blkdev_get(bdev, mode);
if (ret)
- goto bdput;
- return 0;
+ goto abort_claiming;
+ if (mode & FMODE_EXCL) {
+ bd_finish_claiming(bdev, holder);
-bdput:
- bdput(bdev);
- return ret;
+ /*
+ * Block event polling for write claims if requested. Any write
+ * holder makes the write_holder state stick until all are
+ * released. This is good enough and tracking individual
+ * writeable reference is too fragile given the way @mode is
+ * used in blkdev_get/put().
+ */
+ if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+ (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
+ }
+ mutex_unlock(&bdev->bd_mutex);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+ return bdev;
+
+abort_claiming:
+ if (mode & FMODE_EXCL)
+ bd_abort_claiming(bdev, holder);
+ mutex_unlock(&bdev->bd_mutex);
+ disk_unblock_events(disk);
+put_blkdev:
+ blkdev_put_no_open(bdev);
+ if (ret == -ERESTARTSYS)
+ goto retry;
+ return ERR_PTR(ret);
}
+EXPORT_SYMBOL(blkdev_get_by_dev);
/**
* blkdev_get_by_path - open a block device by name
@@ -1647,32 +1477,30 @@ bdput:
* @mode: FMODE_* mask
* @holder: exclusive holder identifier
*
- * Open the blockdevice described by the device file at @path. @mode
- * and @holder are identical to blkdev_get().
- *
- * On success, the returned block_device has reference count of one.
+ * Open the block device described by the device file at @path. If @mode
+ * includes %FMODE_EXCL, the block device is opened with exclusive access.
+ * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may
+ * nest for the same @holder.
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
*/
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
void *holder)
{
struct block_device *bdev;
- int err;
-
- bdev = lookup_bdev(path);
- if (IS_ERR(bdev))
- return bdev;
+ dev_t dev;
+ int error;
- err = blkdev_get(bdev, mode, holder);
- if (err)
- return ERR_PTR(err);
+ error = lookup_bdev(path, &dev);
+ if (error)
+ return ERR_PTR(error);
- if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+ bdev = blkdev_get_by_dev(dev, mode, holder);
+ if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
blkdev_put(bdev, mode);
return ERR_PTR(-EACCES);
}
@@ -1681,45 +1509,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
}
EXPORT_SYMBOL(blkdev_get_by_path);
-/**
- * blkdev_get_by_dev - open a block device by device number
- * @dev: device number of block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the blockdevice described by device number @dev. @mode and
- * @holder are identical to blkdev_get().
- *
- * Use it ONLY if you really do not have anything better - i.e. when
- * you are behind a truly sucky interface and all you are given is a
- * device number. _Never_ to be used for internal purposes. If you
- * ever need it - reconsider your API.
- *
- * On success, the returned block_device has reference count of one.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
-{
- struct block_device *bdev;
- int err;
-
- bdev = bdget(dev);
- if (!bdev)
- return ERR_PTR(-ENOMEM);
-
- err = blkdev_get(bdev, mode, holder);
- if (err)
- return ERR_PTR(err);
-
- return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
static int blkdev_open(struct inode * inode, struct file * filp)
{
struct block_device *bdev;
@@ -1741,14 +1530,12 @@ static int blkdev_open(struct inode * inode, struct file * filp)
if ((filp->f_flags & O_ACCMODE) == 3)
filp->f_mode |= FMODE_WRITE_IOCTL;
- bdev = bd_acquire(inode);
- if (bdev == NULL)
- return -ENOMEM;
-
+ bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
-
- return blkdev_get(bdev, filp->f_mode, filp);
+ return 0;
}
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1774,34 +1561,28 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
-
bdev_write_inode(bdev);
+ if (bdev_is_partition(bdev))
+ victim = bdev_whole(bdev);
}
- if (bdev->bd_contains == bdev) {
- if (disk->fops->release)
- disk->fops->release(disk, mode);
- }
- if (!bdev->bd_openers) {
- disk_put_part(bdev->bd_part);
- bdev->bd_part = NULL;
- bdev->bd_disk = NULL;
- if (bdev != bdev->bd_contains)
- victim = bdev->bd_contains;
- bdev->bd_contains = NULL;
-
- put_disk_and_module(disk);
- }
+
+ if (!bdev_is_partition(bdev) && disk->fops->release)
+ disk->fops->release(disk, mode);
mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
- if (victim)
+ if (victim) {
__blkdev_put(victim, mode, 1);
+ bdput(victim);
+ }
}
void blkdev_put(struct block_device *bdev, fmode_t mode)
{
+ struct gendisk *disk = bdev->bd_disk;
+
mutex_lock(&bdev->bd_mutex);
if (mode & FMODE_EXCL) {
+ struct block_device *whole = bdev_whole(bdev);
bool bdev_free;
/*
@@ -1812,13 +1593,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
spin_lock(&bdev_lock);
WARN_ON_ONCE(--bdev->bd_holders < 0);
- WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+ WARN_ON_ONCE(--whole->bd_holders < 0);
- /* bd_contains might point to self, check in a separate step */
if ((bdev_free = !bdev->bd_holders))
bdev->bd_holder = NULL;
- if (!bdev->bd_contains->bd_holders)
- bdev->bd_contains->bd_holder = NULL;
+ if (!whole->bd_holders)
+ whole->bd_holder = NULL;
spin_unlock(&bdev_lock);
@@ -1827,7 +1607,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
* unblock evpoll if it was a write holder.
*/
if (bdev_free && bdev->bd_write_holder) {
- disk_unblock_events(bdev->bd_disk);
+ disk_unblock_events(disk);
bdev->bd_write_holder = false;
}
}
@@ -1837,11 +1617,11 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
* event. This is to ensure detection of media removal commanded
* from userland - e.g. eject(1).
*/
- disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
-
+ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
mutex_unlock(&bdev->bd_mutex);
__blkdev_put(bdev, mode, 0);
+ blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
@@ -2054,37 +1834,32 @@ const struct file_operations def_blk_fops = {
* namespace if possible and return it. Return ERR_PTR(error)
* otherwise.
*/
-struct block_device *lookup_bdev(const char *pathname)
+int lookup_bdev(const char *pathname, dev_t *dev)
{
- struct block_device *bdev;
struct inode *inode;
struct path path;
int error;
if (!pathname || !*pathname)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
error = kern_path(pathname, LOOKUP_FOLLOW, &path);
if (error)
- return ERR_PTR(error);
+ return error;
inode = d_backing_inode(path.dentry);
error = -ENOTBLK;
if (!S_ISBLK(inode->i_mode))
- goto fail;
+ goto out_path_put;
error = -EACCES;
if (!may_open_dev(&path))
- goto fail;
- error = -ENOMEM;
- bdev = bd_acquire(inode);
- if (!bdev)
- goto fail;
-out:
+ goto out_path_put;
+
+ *dev = inode->i_rdev;
+ error = 0;
+out_path_put:
path_put(&path);
- return bdev;
-fail:
- bdev = ERR_PTR(error);
- goto out;
+ return error;
}
EXPORT_SYMBOL(lookup_bdev);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index e738f6206ea5..9f1b1a88e317 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,7 @@
obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
- file-item.o inode-item.o inode-map.o disk-io.o \
+ file-item.o inode-item.o disk-io.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
@@ -16,6 +16,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 771a036867dc..02d7d7b2563b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -783,8 +783,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
- ref->level - 1, NULL);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte,
+ ref->root_id, 0, ref->level - 1, NULL);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -1331,7 +1331,7 @@ again:
struct extent_buffer *eb;
eb = read_tree_block(fs_info, ref->parent, 0,
- ref->level, NULL);
+ 0, ref->level, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -1341,14 +1341,12 @@ again:
goto out;
}
- if (!path->skip_locking) {
+ if (!path->skip_locking)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
ret = find_extent_in_eb(eb, bytenr,
*extent_item_pos, &eie, ignore_offset);
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
if (ret < 0)
goto out;
@@ -1671,13 +1669,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
s64 bytes_left = ((s64)size) - 1;
struct extent_buffer *eb = eb_in;
struct btrfs_key found_key;
- int leave_spinning = path->leave_spinning;
struct btrfs_inode_ref *iref;
if (bytes_left >= 0)
dest[bytes_left] = '\0';
- path->leave_spinning = 1;
while (1) {
bytes_left -= name_len;
if (bytes_left >= 0)
@@ -1685,7 +1681,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
name_off, name_len);
if (eb != eb_in) {
if (!path->skip_locking)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1705,8 +1701,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- if (!path->skip_locking)
- btrfs_set_lock_blocking_read(eb);
path->nodes[0] = NULL;
path->locks[0] = 0;
}
@@ -1723,7 +1717,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
}
btrfs_release_path(path);
- path->leave_spinning = leave_spinning;
if (ret)
return ERR_PTR(ret);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3ba6f3839d39..52f2198d44c9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -424,6 +424,23 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
return ret;
}
+static bool space_cache_v1_done(struct btrfs_block_group *cache)
+{
+ bool ret;
+
+ spin_lock(&cache->lock);
+ ret = cache->cached != BTRFS_CACHE_FAST;
+ spin_unlock(&cache->lock);
+
+ return ret;
+}
+
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl)
+{
+ wait_event(caching_ctl->wait, space_cache_v1_done(cache));
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
@@ -639,11 +656,28 @@ static noinline void caching_thread(struct btrfs_work *work)
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
+ if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ ret = load_free_space_cache(block_group);
+ if (ret == 1) {
+ ret = 0;
+ goto done;
+ }
+
+ /*
+ * We failed to load the space cache, set ourselves to
+ * CACHE_STARTED and carry on.
+ */
+ spin_lock(&block_group->lock);
+ block_group->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&block_group->lock);
+ wake_up(&caching_ctl->wait);
+ }
+
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
-
+done:
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
@@ -679,7 +713,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
{
DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_caching_control *caching_ctl;
+ struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
@@ -691,119 +725,41 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->start;
- refcount_set(&caching_ctl->count, 1);
+ refcount_set(&caching_ctl->count, 2);
btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
- /*
- * This should be a rare occasion, but this could happen I think in the
- * case where one thread starts to load the space cache info, and then
- * some other thread starts a transaction commit which tries to do an
- * allocation while the other thread is still loading the space cache
- * info. The previous loop should have kept us from choosing this block
- * group, but if we've moved to the state where we will wait on caching
- * block groups we need to first check if we're doing a fast load here,
- * so we can wait for it to finish, otherwise we could end up allocating
- * from a block group who's cache gets evicted for one reason or
- * another.
- */
- while (cache->cached == BTRFS_CACHE_FAST) {
- struct btrfs_caching_control *ctl;
-
- ctl = cache->caching_ctl;
- refcount_inc(&ctl->count);
- prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&cache->lock);
-
- schedule();
-
- finish_wait(&ctl->wait, &wait);
- btrfs_put_caching_control(ctl);
- spin_lock(&cache->lock);
- }
-
if (cache->cached != BTRFS_CACHE_NO) {
- spin_unlock(&cache->lock);
kfree(caching_ctl);
- return 0;
+
+ caching_ctl = cache->caching_ctl;
+ if (caching_ctl)
+ refcount_inc(&caching_ctl->count);
+ spin_unlock(&cache->lock);
+ goto out;
}
WARN_ON(cache->caching_ctl);
cache->caching_ctl = caching_ctl;
- cache->cached = BTRFS_CACHE_FAST;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ cache->cached = BTRFS_CACHE_FAST;
+ else
+ cache->cached = BTRFS_CACHE_STARTED;
+ cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
- if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
- mutex_lock(&caching_ctl->mutex);
- ret = load_free_space_cache(cache);
-
- spin_lock(&cache->lock);
- if (ret == 1) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_FINISHED;
- cache->last_byte_to_unpin = (u64)-1;
- caching_ctl->progress = (u64)-1;
- } else {
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- }
- spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
- if (ret == 1 &&
- btrfs_should_fragment_free_space(cache)) {
- u64 bytes_used;
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- bytes_used = cache->length - cache->used;
- cache->space_info->bytes_used += bytes_used >> 1;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fragment_free_space(cache);
- }
-#endif
- mutex_unlock(&caching_ctl->mutex);
-
- wake_up(&caching_ctl->wait);
- if (ret == 1) {
- btrfs_put_caching_control(caching_ctl);
- btrfs_free_excluded_extents(cache);
- return 0;
- }
- } else {
- /*
- * We're either using the free space tree or no caching at all.
- * Set cached to the appropriate value and wakeup any waiters.
- */
- spin_lock(&cache->lock);
- if (load_cache_only) {
- cache->caching_ctl = NULL;
- cache->cached = BTRFS_CACHE_NO;
- } else {
- cache->cached = BTRFS_CACHE_STARTED;
- cache->has_caching_ctl = 1;
- }
- spin_unlock(&cache->lock);
- wake_up(&caching_ctl->wait);
- }
-
- if (load_cache_only) {
- btrfs_put_caching_control(caching_ctl);
- return 0;
- }
-
- down_write(&fs_info->commit_root_sem);
+ spin_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->commit_root_sem);
+ spin_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
+out:
+ if (load_cache_only && caching_ctl)
+ btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+ if (caching_ctl)
+ btrfs_put_caching_control(caching_ctl);
return ret;
}
@@ -892,8 +848,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
- struct btrfs_root *tree_root = fs_info->tree_root;
- struct btrfs_key key;
struct inode *inode;
struct kobject *kobj = NULL;
int ret;
@@ -971,42 +925,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);
- if (!IS_ERR(inode)) {
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
- if (ret) {
- btrfs_add_delayed_iput(inode);
- goto out;
- }
- clear_nlink(inode);
- /* One for the block groups ref */
- spin_lock(&block_group->lock);
- if (block_group->iref) {
- block_group->iref = 0;
- block_group->inode = NULL;
- spin_unlock(&block_group->lock);
- iput(inode);
- } else {
- spin_unlock(&block_group->lock);
- }
- /* One for our lookup ref */
- btrfs_add_delayed_iput(inode);
- }
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.type = 0;
- key.offset = block_group->start;
-
- ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+ if (ret)
goto out;
- if (ret > 0)
- btrfs_release_path(path);
- if (ret == 0) {
- ret = btrfs_del_item(trans, tree_root, path);
- if (ret)
- goto out;
- btrfs_release_path(path);
- }
spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
@@ -1043,7 +964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
- down_write(&fs_info->commit_root_sem);
+ spin_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@@ -1057,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
- up_write(&fs_info->commit_root_sem);
+ spin_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@@ -1723,6 +1644,7 @@ out:
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ const bool zoned = btrfs_is_zoned(fs_info);
u64 bytenr;
u64 *logical;
int stripe_len;
@@ -1744,6 +1666,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (ret)
return ret;
+ /* Shouldn't have super stripes in sequential zones */
+ if (zoned && nr) {
+ btrfs_err(fs_info,
+ "zoned: block group %llu must not contain super block",
+ cache->start);
+ return -EUCLEAN;
+ }
+
while (nr--) {
u64 len = min_t(u64, stripe_len,
cache->start + cache->length - logical[nr]);
@@ -1805,7 +1735,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
- btrfs_init_free_space_ctl(cache);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
@@ -1985,6 +1915,51 @@ error:
return ret;
}
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct btrfs_space_info *space_info;
+ struct rb_node *node;
+ int ret = 0;
+
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct btrfs_block_group *bg;
+
+ em = rb_entry(node, struct extent_map, rb_node);
+ map = em->map_lookup;
+ bg = btrfs_create_block_group_cache(fs_info, em->start);
+ if (!bg) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ /* Fill dummy cache as FULL */
+ bg->length = em->len;
+ bg->flags = map->type;
+ bg->last_byte_to_unpin = (u64)-1;
+ bg->cached = BTRFS_CACHE_FINISHED;
+ bg->used = em->len;
+ bg->flags = map->type;
+ ret = btrfs_add_block_group_cache(fs_info, bg);
+ if (ret) {
+ btrfs_remove_free_space_cache(bg);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
+ 0, &space_info);
+ bg->space_info = space_info;
+ link_block_group(bg);
+
+ set_avail_alloc_bits(fs_info, bg->flags);
+ }
+ if (!ret)
+ btrfs_init_global_block_rsv(fs_info);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
@@ -1995,6 +1970,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
int need_clear = 0;
u64 cache_gen;
+ if (!info->extent_root)
+ return fill_dummy_bgs(info);
+
key.objectid = 0;
key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2152,7 +2130,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
- cache->needs_free_space = 1;
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2361,6 +2340,9 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
int retries = 0;
int ret = 0;
+ if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+ return 0;
+
/*
* If this block group is smaller than 100 megs don't bother caching the
* block group.
@@ -2401,7 +2383,7 @@ again:
* time.
*/
BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
/*
* So theoretically we could recover from this, simply set the
@@ -3307,14 +3289,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->commit_root_sem);
+ spin_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
- up_write(&info->commit_root_sem);
+ spin_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index adfd7583a17b..8f74a96074f7 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -268,6 +268,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+ struct btrfs_caching_control *caching_ctl);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index bc920afe23bf..04a6226e0388 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
+ /*
+ * Our various recovery options can leave us with NULL roots, so check
+ * here and just bail before we go dereferencing NULLs everywhere.
+ */
+ if (!fs_info->extent_root || !fs_info->csum_root ||
+ !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
+ return;
+
fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 92dd86bceae3..555cbcef6585 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -35,6 +35,13 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
+ /*
+ * Set and used when logging an inode and it serves to signal that an
+ * inode does not have xattrs, so subsequent fsyncs can avoid searching
+ * for xattrs to log. This bit must be cleared whenever a xattr is added
+ * to an inode.
+ */
+ BTRFS_INODE_NO_XATTRS,
};
/* in memory btrfs inode */
@@ -50,7 +57,8 @@ struct btrfs_inode {
/*
* Lock for counters and all fields used to determine if the inode is in
* the log or not (last_trans, last_sub_trans, last_log_commit,
- * logged_trans).
+ * logged_trans), to access/update new_delalloc_bytes and to update the
+ * VFS' inode number of bytes used.
*/
spinlock_t lock;
@@ -203,16 +211,6 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
- /*
- * To avoid races between lockless (i_mutex not held) direct IO writes
- * and concurrent fsync requests. Direct IO writes must acquire read
- * access on this semaphore for creating an extent map and its
- * corresponding ordered extent. The fast fsync path must acquire write
- * access on this semaphore before it collects ordered extents and
- * extent maps.
- */
- struct rw_semaphore dio_sem;
-
struct inode vfs_inode;
};
@@ -341,8 +339,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
- struct btrfs_super_block *sb = root->fs_info->super_copy;
- const u16 csum_size = btrfs_super_csum_size(sb);
+ const u32 csum_size = root->fs_info->csum_size;
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 81a8c87a5afb..6ff44e53814c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -233,7 +233,6 @@ struct btrfsic_stack_frame {
struct btrfsic_state {
u32 print_mask;
int include_extent_data;
- int csum_size;
struct list_head all_blocks_list;
struct btrfsic_block_hashtable block_hashtable;
struct btrfsic_block_link_hashtable block_link_hashtable;
@@ -660,8 +659,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
return -1;
}
- state->csum_size = btrfs_super_csum_size(selected_super);
-
for (pass = 0; pass < 3; pass++) {
int num_copies;
int mirror_num;
@@ -954,7 +951,7 @@ static noinline_for_stack int btrfsic_process_metablock(
sf->prev = NULL;
continue_with_new_stack_frame:
- sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ sf->block->generation = btrfs_stack_header_generation(sf->hdr);
if (0 == sf->hdr->level) {
struct btrfs_leaf *const leafhdr =
(struct btrfs_leaf *)sf->hdr;
@@ -1723,7 +1720,7 @@ static noinline_for_stack int btrfsic_test_for_metadata(
crypto_shash_update(shash, data, sublen);
}
crypto_shash_final(shash, csum);
- if (memcmp(csum, h->csum, state->csum_size))
+ if (memcmp(csum, h->csum, fs_info->csum_size))
return 1;
return 0; /* is metadata */
@@ -2695,8 +2692,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
bio_op(bio), bio->bi_opf, segs,
- (unsigned long long)bio->bi_iter.bi_sector,
- dev_bytenr, bio->bi_disk);
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk);
mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
@@ -2797,7 +2793,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
state->fs_info = fs_info;
state->print_mask = print_mask;
state->include_extent_data = including_extent_data;
- state->csum_size = 0;
state->metablock_size = fs_info->nodesize;
state->datablock_size = fs_info->sectorsize;
INIT_LIST_HEAD(&state->all_blocks_list);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index eeface30facd..5ae3fa0386b7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -131,10 +131,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
unsigned long disk_size)
{
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
+ (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
@@ -142,7 +140,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
struct page *page;
unsigned long i;
char *kaddr;
@@ -150,7 +148,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
struct compressed_bio *cb = bio->bi_private;
u8 *cb_sum = cb->sums;
- if (inode->flags & BTRFS_INODE_NODATASUM)
+ if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
return 0;
shash->tfm = fs_info->csum_shash;
@@ -220,7 +218,7 @@ static void end_compressed_bio_read(struct bio *bio)
inode = cb->inode;
ret = check_compressed_csum(BTRFS_I(inode), bio,
- (u64)bio->bi_iter.bi_sector << 9);
+ bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -622,13 +620,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long pg_index;
struct page *page;
struct bio *comp_bio;
- u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
+ u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
u64 em_len;
u64 em_start;
struct extent_map *em;
blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@@ -722,15 +719,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
*/
refcount_inc(&cb->pending_bios);
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio,
- (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
- sums += csum_size * nr_sectors;
+ sums += fs_info->csum_size * nr_sectors;
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
@@ -751,10 +745,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
- BUG_ON(ret); /* -ENOMEM */
- }
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ BUG_ON(ret); /* -ENOMEM */
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 113da62dc17f..07810891e204 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1278,14 +1278,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (!tm)
return eb;
- btrfs_set_path_blocking(path);
- btrfs_set_lock_blocking_read(eb);
-
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
@@ -1297,13 +1294,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
} else {
eb_rewin = btrfs_clone_extent_buffer(eb);
if (!eb_rewin) {
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
}
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
@@ -1356,7 +1353,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0, level, NULL);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
@@ -1373,9 +1371,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
- btrfs_set_lock_blocking_read(eb_root);
eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock_blocking(eb_root);
+ btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
}
@@ -1483,10 +1480,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
search_start = buf->start & ~((u64)SZ_1G - 1);
- if (parent)
- btrfs_set_lock_blocking_write(parent);
- btrfs_set_lock_blocking_write(buf);
-
/*
* Before CoWing this block for later modification, check if it's
* the subtree root and do the delayed subtree trace if needed.
@@ -1578,7 +1571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *cur;
u64 blocknr;
- u64 gen;
u64 search_start = *last_ret;
u64 last_block = 0;
u64 other;
@@ -1586,14 +1578,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
int end_slot;
int i;
int err = 0;
- int parent_level;
- int uptodate;
u32 blocksize;
int progress_passed = 0;
struct btrfs_disk_key disk_key;
- parent_level = btrfs_header_level(parent);
-
WARN_ON(trans->transaction != fs_info->running_transaction);
WARN_ON(trans->transid != fs_info->generation);
@@ -1604,10 +1592,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (parent_nritems <= 1)
return 0;
- btrfs_set_lock_blocking_write(parent);
-
for (i = start_slot; i <= end_slot; i++) {
- struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1616,8 +1601,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
- gen = btrfs_node_ptr_generation(parent, i);
- btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1634,36 +1617,13 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = find_extent_buffer(fs_info, blocknr);
- if (cur)
- uptodate = btrfs_buffer_uptodate(cur, gen, 0);
- else
- uptodate = 0;
- if (!cur || !uptodate) {
- if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen,
- parent_level - 1,
- &first_key);
- if (IS_ERR(cur)) {
- return PTR_ERR(cur);
- } else if (!extent_buffer_uptodate(cur)) {
- free_extent_buffer(cur);
- return -EIO;
- }
- } else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen,
- parent_level - 1,&first_key);
- if (err) {
- free_extent_buffer(cur);
- return err;
- }
- }
- }
+ cur = btrfs_read_node_slot(parent, i);
+ if (IS_ERR(cur))
+ return PTR_ERR(cur);
if (search_start == 0)
search_start = last_block;
btrfs_tree_lock(cur);
- btrfs_set_lock_blocking_write(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
@@ -1723,9 +1683,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
oip = offset_in_page(offset);
if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = offset >> PAGE_SHIFT;
+ const unsigned long idx = get_eb_page_index(offset);
char *kaddr = page_address(eb->pages[idx]);
+ oip = get_eb_offset_in_page(eb, offset);
tmp = (struct btrfs_disk_key *)(kaddr + oip);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
@@ -1801,6 +1762,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
+ btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
@@ -1835,8 +1797,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
mid = path->nodes[level];
- WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
- path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
+ WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
WARN_ON(btrfs_header_generation(mid) != trans->transid);
orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1865,7 +1826,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
btrfs_tree_lock(child);
- btrfs_set_lock_blocking_write(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
BTRFS_NESTING_COW);
if (ret) {
@@ -1904,7 +1864,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (left) {
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
parent, pslot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -1920,7 +1879,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (right) {
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
parent, pslot + 1, &right,
BTRFS_NESTING_RIGHT_COW);
@@ -2084,7 +2042,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 left_nr;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2139,7 +2096,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
u32 right_nr;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
@@ -2243,7 +2199,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
search = btrfs_node_blockptr(node, nr);
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
- readahead_tree_block(fs_info, search);
+ btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
@@ -2252,16 +2208,11 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
+ struct extent_buffer *parent;
int slot;
int nritems;
- struct extent_buffer *parent;
- struct extent_buffer *eb;
- u64 gen;
- u64 block1 = 0;
- u64 block2 = 0;
parent = path->nodes[level + 1];
if (!parent)
@@ -2270,32 +2221,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- if (slot > 0) {
- block1 = btrfs_node_blockptr(parent, slot - 1);
- gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = find_extent_buffer(fs_info, block1);
- /*
- * if we get -eagain from btrfs_buffer_uptodate, we
- * don't want to return eagain here. That will loop
- * forever
- */
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block1 = 0;
- free_extent_buffer(eb);
- }
- if (slot + 1 < nritems) {
- block2 = btrfs_node_blockptr(parent, slot + 1);
- gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = find_extent_buffer(fs_info, block2);
- if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
- block2 = 0;
- free_extent_buffer(eb);
- }
-
- if (block1)
- readahead_tree_block(fs_info, block1);
- if (block2)
- readahead_tree_block(fs_info, block2);
+ if (slot > 0)
+ btrfs_readahead_node_child(parent, slot - 1);
+ if (slot + 1 < nritems)
+ btrfs_readahead_node_child(parent, slot + 1);
}
@@ -2399,14 +2328,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
- /* the pages were up to date, but we failed
- * the generation number check. Do a full
- * read for the generation number that is correct.
- * We must do this without dropping locks so
- * we can trust our generation number
- */
- btrfs_set_path_blocking(p);
-
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
if (!ret) {
@@ -2426,14 +2347,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* out which blocks to read.
*/
btrfs_unlock_up_safe(p, level + 1);
- btrfs_set_path_blocking(p);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
- &first_key);
+ tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
+ gen, parent_level - 1, &first_key);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -2468,58 +2388,42 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
int *write_lock_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = split_node(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = split_node(trans, root, p, level);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
} else if (ins_len < 0 && btrfs_header_nritems(b) <
BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
- int sret;
if (*write_lock_level < level + 1) {
*write_lock_level = level + 1;
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
- btrfs_set_path_blocking(p);
- reada_for_balance(fs_info, p, level);
- sret = balance_level(trans, root, p, level);
+ reada_for_balance(p, level);
+ ret = balance_level(trans, root, p, level);
+ if (ret)
+ return ret;
- if (sret) {
- ret = sret;
- goto done;
- }
b = p->nodes[level];
if (!b) {
btrfs_release_path(p);
- goto again;
+ return -EAGAIN;
}
BUG_ON(btrfs_header_nritems(b) == 1);
}
- return 0;
-
-again:
- ret = -EAGAIN;
-done:
return ret;
}
@@ -2616,7 +2520,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = __btrfs_read_lock_root_node(root, p->recurse);
+ b = btrfs_read_lock_root_node(root);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2752,7 +2656,6 @@ again:
goto again;
}
- btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
&b,
@@ -2822,7 +2725,6 @@ cow_done:
goto again;
}
- btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
@@ -2884,17 +2786,10 @@ cow_done:
if (!p->skip_locking) {
level = btrfs_header_level(b);
if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
+ btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
- p->recurse);
- }
+ btrfs_tree_read_lock(b);
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
@@ -2902,12 +2797,6 @@ cow_done:
}
ret = 1;
done:
- /*
- * we don't really know what they plan on doing with the path
- * from here on, so for now just mark it as blocking
- */
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
return ret;
@@ -2999,10 +2888,7 @@ again:
}
level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
+ btrfs_tree_read_lock(b);
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
@@ -3013,8 +2899,6 @@ again:
}
ret = 1;
done:
- if (!p->leave_spinning)
- btrfs_set_path_blocking(p);
if (ret < 0)
btrfs_release_path(p);
@@ -3441,7 +3325,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
atomic_inc(&c->refs);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
return 0;
}
@@ -3562,7 +3446,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
- ret = 0;
btrfs_mark_buffer_dirty(c);
btrfs_mark_buffer_dirty(split);
@@ -3580,7 +3463,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(split);
free_extent_buffer(split);
}
- return ret;
+ return 0;
}
/*
@@ -3814,7 +3697,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
- btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
if (free_space < data_size)
@@ -4053,7 +3935,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
return 1;
__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
- btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
if (free_space < data_size) {
@@ -4448,7 +4329,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
goto err;
}
- btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
if (ret)
goto err;
@@ -4478,8 +4358,6 @@ static noinline int split_item(struct btrfs_path *path,
leaf = path->nodes[0];
BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
- btrfs_set_path_blocking(path);
-
item = btrfs_item_nr(path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
@@ -5055,7 +4933,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
- btrfs_set_path_blocking(path);
btrfs_clean_tree_block(leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
@@ -5077,7 +4954,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
slot = path->slots[1];
atomic_inc(&leaf->refs);
- btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
@@ -5248,7 +5124,6 @@ find_next_key:
*/
if (slot >= nritems) {
path->slots[level] = slot;
- btrfs_set_path_blocking(path);
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -5265,7 +5140,6 @@ find_next_key:
ret = 0;
goto out;
}
- btrfs_set_path_blocking(path);
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -5282,7 +5156,6 @@ out:
path->keep_locks = keep_locks;
if (ret == 0) {
btrfs_unlock_up_safe(path, path->lowest_level + 1);
- btrfs_set_path_blocking(path);
memcpy(min_key, &found_key, sizeof(found_key));
}
return ret;
@@ -5384,8 +5257,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key key;
u32 nritems;
int ret;
- int old_spinning = path->leave_spinning;
- int next_rw_lock = 0;
+ int i;
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
@@ -5395,11 +5267,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
again:
level = 1;
next = NULL;
- next_rw_lock = 0;
btrfs_release_path(path);
path->keep_locks = 1;
- path->leave_spinning = 1;
if (time_seq)
ret = btrfs_search_old_slot(root, &key, path, time_seq);
@@ -5459,13 +5329,22 @@ again:
continue;
}
- if (next) {
- btrfs_tree_unlock_rw(next, next_rw_lock);
- free_extent_buffer(next);
+
+ /*
+ * Our current level is where we're going to start from, and to
+ * make sure lockdep doesn't complain we need to drop our locks
+ * and nodes from 0 to our current level.
+ */
+ for (i = 0; i < level; i++) {
+ if (path->locks[level]) {
+ btrfs_tree_read_unlock(path->nodes[i]);
+ path->locks[i] = 0;
+ }
+ free_extent_buffer(path->nodes[i]);
+ path->nodes[i] = NULL;
}
next = c;
- next_rw_lock = path->locks[level];
ret = read_block_for_search(root, path, &next, level,
slot, &key);
if (ret == -EAGAIN)
@@ -5491,28 +5370,18 @@ again:
cond_resched();
goto again;
}
- if (!ret) {
- btrfs_set_path_blocking(path);
- __btrfs_tree_read_lock(next,
- BTRFS_NESTING_RIGHT,
- path->recurse);
- }
- next_rw_lock = BTRFS_READ_LOCK;
+ if (!ret)
+ btrfs_tree_read_lock(next);
}
break;
}
path->slots[level] = slot;
while (1) {
level--;
- c = path->nodes[level];
- if (path->locks[level])
- btrfs_tree_unlock_rw(c, path->locks[level]);
-
- free_extent_buffer(c);
path->nodes[level] = next;
path->slots[level] = 0;
if (!path->skip_locking)
- path->locks[level] = next_rw_lock;
+ path->locks[level] = BTRFS_READ_LOCK;
if (!level)
break;
@@ -5526,23 +5395,12 @@ again:
goto done;
}
- if (!path->skip_locking) {
- ret = btrfs_try_tree_read_lock(next);
- if (!ret) {
- btrfs_set_path_blocking(path);
- __btrfs_tree_read_lock(next,
- BTRFS_NESTING_RIGHT,
- path->recurse);
- }
- next_rw_lock = BTRFS_READ_LOCK;
- }
+ if (!path->skip_locking)
+ btrfs_tree_read_lock(next);
}
ret = 0;
done:
unlock_up(path, 0, 1, 0, NULL);
- path->leave_spinning = old_spinning;
- if (!old_spinning)
- btrfs_set_path_blocking(path);
return ret;
}
@@ -5564,7 +5422,6 @@ int btrfs_previous_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
@@ -5606,7 +5463,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
while (1) {
if (path->slots[0] == 0) {
- btrfs_set_path_blocking(path);
ret = btrfs_prev_leaf(root, path);
if (ret != 0)
return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0b29bdb25105..1d3c1e479f3d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,7 +17,6 @@
#include <linux/wait.h>
#include <linux/slab.h>
#include <trace/events/btrfs.h>
-#include <asm/kmap_types.h>
#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
@@ -28,6 +27,7 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include <linux/iomap.h>
#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
@@ -67,12 +67,6 @@ struct btrfs_ref;
#define BTRFS_OLDEST_GENERATION 0ULL
/*
- * the max metadata block size. This limit is somewhat artificial,
- * but the memmove costs go through the roof for larger blocks.
- */
-#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
-
-/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
*/
@@ -370,11 +364,9 @@ struct btrfs_path {
unsigned int search_for_split:1;
unsigned int keep_locks:1;
unsigned int skip_locking:1;
- unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
- unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -469,10 +461,11 @@ struct btrfs_discard_ctl {
struct btrfs_block_group *block_group;
struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
u64 prev_discard;
+ u64 prev_discard_time;
atomic_t discardable_extents;
atomic64_t discardable_bytes;
u64 max_discard_size;
- unsigned long delay;
+ u64 delay_ms;
u32 iops_limit;
u32 kbps_limit;
u64 discard_extent_bytes;
@@ -559,6 +552,9 @@ enum {
/* Indicate that the discard workqueue can service discards. */
BTRFS_FS_DISCARD_RUNNING,
+
+ /* Indicate that we need to cleanup space cache v1 */
+ BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
};
/*
@@ -912,6 +908,7 @@ struct btrfs_fs_info {
/* Extent buffer radix tree */
spinlock_t buffer_lock;
+ /* Entries are eb->start / sectorsize */
struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
@@ -934,6 +931,10 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
+ /* ilog2 of sectorsize, use to avoid 64bit division */
+ u32 sectorsize_bits;
+ u32 csum_size;
+ u32 csums_per_leaf;
u32 stripesize;
/* Block groups and devices containing active swapfiles. */
@@ -951,6 +952,18 @@ struct btrfs_fs_info {
/* Type of exclusive operation running */
unsigned long exclusive_operation;
+ /*
+ * Zone size > 0 when in ZONED mode, otherwise it's used for a check
+ * if the mode is enabled
+ */
+ union {
+ u64 zone_size;
+ u64 zoned;
+ };
+
+ /* Max size to emit ZONE_APPEND write command */
+ u64 max_zone_append_size;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1021,7 +1034,7 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
- /* The root has a log tree. Used only for subvolume roots. */
+ /* The root has a log tree. Used for subvolume roots and the tree root. */
BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING,
@@ -1060,15 +1073,6 @@ struct btrfs_root {
spinlock_t accounting_lock;
struct btrfs_block_rsv *block_rsv;
- /* free ino cache stuff */
- struct btrfs_free_space_ctl *free_ino_ctl;
- enum btrfs_caching_type ino_cache_state;
- spinlock_t ino_cache_lock;
- wait_queue_head_t ino_cache_wait;
- struct btrfs_free_space_ctl *free_ino_pinned;
- u64 ino_cache_progress;
- struct inode *ino_cache_inode;
-
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1227,6 +1231,63 @@ struct btrfs_replace_extent_info {
int insertions;
};
+/* Arguments for btrfs_drop_extents() */
+struct btrfs_drop_extents_args {
+ /* Input parameters */
+
+ /*
+ * If NULL, btrfs_drop_extents() will allocate and free its own path.
+ * If 'replace_extent' is true, this must not be NULL. Also the path
+ * is always released except if 'replace_extent' is true and
+ * btrfs_drop_extents() sets 'extent_inserted' to true, in which case
+ * the path is kept locked.
+ */
+ struct btrfs_path *path;
+ /* Start offset of the range to drop extents from */
+ u64 start;
+ /* End (exclusive, last byte + 1) of the range to drop extents from */
+ u64 end;
+ /* If true drop all the extent maps in the range */
+ bool drop_cache;
+ /*
+ * If true it means we want to insert a new extent after dropping all
+ * the extents in the range. If this is true, the 'extent_item_size'
+ * parameter must be set as well and the 'extent_inserted' field will
+ * be set to true by btrfs_drop_extents() if it could insert the new
+ * extent.
+ * Note: when this is set to true the path must not be NULL.
+ */
+ bool replace_extent;
+ /*
+ * Used if 'replace_extent' is true. Size of the file extent item to
+ * insert after dropping all existing extents in the range
+ */
+ u32 extent_item_size;
+
+ /* Output parameters */
+
+ /*
+ * Set to the minimum between the input parameter 'end' and the end
+ * (exclusive, last byte + 1) of the last dropped extent. This is always
+ * set even if btrfs_drop_extents() returns an error.
+ */
+ u64 drop_end;
+ /*
+ * The number of allocated bytes found in the range. This can be smaller
+ * than the range's length when there are holes in the range.
+ */
+ u64 bytes_found;
+ /*
+ * Only set if 'replace_extent' is true. Set to true if we were able
+ * to insert a replacement extent after dropping all extents in the
+ * range, otherwise set to false by btrfs_drop_extents().
+ * Also, if btrfs_drop_extents() has set this to true it means it
+ * returned with the path locked, otherwise if it has set this to
+ * false it has returned with the path released.
+ */
+ bool extent_inserted;
+};
+
struct btrfs_file_private {
void *filldir_buf;
};
@@ -1285,7 +1346,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
-#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
+/* bit 17 is free */
#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
@@ -1298,6 +1359,8 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
+#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30)
+#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -1330,9 +1393,7 @@ do { \
* transaction commit)
*/
-#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0)
-#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1)
-#define BTRFS_PENDING_COMMIT (2)
+#define BTRFS_PENDING_COMMIT (0)
#define btrfs_test_pending(info, opt) \
test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
@@ -1405,7 +1466,7 @@ struct btrfs_map_token {
};
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
- ((bytes) >> (fs_info)->sb->s_blocksize_bits)
+ ((bytes) >> (fs_info)->sectorsize_bits)
static inline void btrfs_init_map_token(struct btrfs_map_token *token,
struct extent_buffer *eb)
@@ -1490,13 +1551,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]); \
+ const type *p = page_address(eb->pages[0]) + \
+ offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]); \
+ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
@@ -2086,6 +2148,7 @@ BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
generation, 64);
BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
@@ -2518,7 +2581,17 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
+/*
+ * Take the number of bytes to be checksummmed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+ const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+ const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+ return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
/*
* Use this if we would be adding new items, as we could split nodes as we cow
@@ -2593,7 +2666,6 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
u64 len);
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
@@ -2940,8 +3012,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst);
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2968,13 +3039,13 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
-void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
@@ -2994,11 +3065,11 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const char *name, int name_len, int add_backref, u64 index);
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front);
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct inode *inode, u64 new_size,
+ struct btrfs_inode *inode, u64 new_size,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
@@ -3038,14 +3109,13 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
@@ -3063,7 +3133,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
+extern const struct iomap_ops btrfs_dio_iomap_ops;
+extern const struct iomap_dio_ops btrfs_dio_ops;
+
+/* Inode locking type flags, by default the exclusive lock is taken */
+#define BTRFS_ILOCK_SHARED (1U << 0)
+#define BTRFS_ILOCK_TRY (1U << 1)
+
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3093,16 +3174,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
int skip_pinned);
extern const struct file_operations btrfs_file_operations;
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted);
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache);
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args);
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
struct btrfs_replace_extent_info *extent_info,
@@ -3112,7 +3186,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached);
+ struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes);
@@ -3292,6 +3366,39 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#endif
/*
+ * Get the correct offset inside the page of extent buffer.
+ *
+ * @eb: target extent buffer
+ * @start: offset inside the extent buffer
+ *
+ * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
+ */
+static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
+ unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
+ * to PAGE_SIZE, thus adding it won't cause any difference.
+ *
+ * For sectorsize < PAGE_SIZE, we must only read the data that belongs
+ * to the eb, thus we have to take the eb->start into consideration.
+ */
+ return offset_in_page(offset + eb->start);
+}
+
+static inline unsigned long get_eb_page_index(unsigned long offset)
+{
+ /*
+ * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ *
+ * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
+ * and have ensured that all tree blocks are contained in one page,
+ * thus we always get index == 0.
+ */
+ return offset >> PAGE_SHIFT;
+}
+
+/*
* Use that for functions that are conditionally exported for sanity tests but
* otherwise static
*/
@@ -3600,4 +3707,9 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
}
#endif
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->zoned != 0;
+}
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5aba81e16113..70c0340d839c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -740,13 +740,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
goto out;
}
- /*
- * we need allocate some memory space, but it might cause the task
- * to sleep, so we set all locked nodes in the path to blocking locks
- * first.
- */
- btrfs_set_path_blocking(path);
-
keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
if (!keys) {
ret = -ENOMEM;
@@ -1154,7 +1147,6 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1219,7 +1211,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
btrfs_release_delayed_node(delayed_node);
return -ENOMEM;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
@@ -1264,7 +1255,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
ret = -ENOMEM;
goto trans_out;
}
- path->leave_spinning = 1;
block_rsv = trans->block_rsv;
trans->block_rsv = &fs_info->delayed_block_rsv;
@@ -1333,7 +1323,6 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
if (!delayed_node)
break;
- path->leave_spinning = 1;
root = delayed_node->root;
trans = btrfs_join_transaction(root);
@@ -1826,27 +1815,29 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
}
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node;
int ret = 0;
- delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item,
+ &inode->vfs_inode);
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode),
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
delayed_node);
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index ca96ef007d8f..b2412160c5bc 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -110,7 +110,8 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode);
+ struct btrfs_root *root,
+ struct btrfs_inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 10638537b9ef..a98e33f232d5 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -21,6 +21,7 @@
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
+#include "zoned.h"
/*
* Device replace overview
@@ -96,7 +97,7 @@ no_valid_dev_replace_entry_found:
* a replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
@@ -159,7 +160,7 @@ no_valid_dev_replace_entry_found:
* replace target, fail the mount.
*/
if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
+ BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
@@ -171,10 +172,10 @@ no_valid_dev_replace_entry_found:
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL, true);
+ src_devid, NULL, NULL);
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL, true);
+ NULL, NULL);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
@@ -259,6 +260,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return PTR_ERR(bdev);
}
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ btrfs_err(fs_info,
+ "dev-replace: zoned type of target device mismatch with filesystem");
+ ret = -EINVAL;
+ goto error;
+ }
+
sync_blockdev(bdev);
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
@@ -313,6 +321,10 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 863367c2c620..98b63ebed539 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -127,7 +127,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
btrfs_cpu_key_to_disk(&disk_key, location);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 741c7e19c32f..1db966bf85b2 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -355,7 +355,7 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
block_group = find_next_block_group(discard_ctl, now);
if (block_group) {
- unsigned long delay = discard_ctl->delay;
+ u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
/*
@@ -366,9 +366,9 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (kbps_limit && discard_ctl->prev_discard) {
u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
u64 bps_delay = div64_u64(discard_ctl->prev_discard *
- MSEC_PER_SEC, bps_limit);
+ NSEC_PER_SEC, bps_limit);
- delay = max(delay, msecs_to_jiffies(bps_delay));
+ delay = max(delay, bps_delay);
}
/*
@@ -378,11 +378,20 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
if (now < block_group->discard_eligible_time) {
u64 bg_timeout = block_group->discard_eligible_time - now;
- delay = max(delay, nsecs_to_jiffies(bg_timeout));
+ delay = max(delay, bg_timeout);
+ }
+
+ if (override && discard_ctl->prev_discard) {
+ u64 elapsed = now - discard_ctl->prev_discard_time;
+
+ if (delay > elapsed)
+ delay -= elapsed;
+ else
+ delay = 0;
}
mod_delayed_work(discard_ctl->discard_workers,
- &discard_ctl->work, delay);
+ &discard_ctl->work, nsecs_to_jiffies(delay));
}
out:
spin_unlock(&discard_ctl->lock);
@@ -465,7 +474,12 @@ static void btrfs_discard_workfn(struct work_struct *work)
discard_ctl->discard_extent_bytes += trimmed;
}
+ /*
+ * Updated without locks as this is inside the workfn and nothing else
+ * is reading the values
+ */
discard_ctl->prev_discard = trimmed;
+ discard_ctl->prev_discard_time = ktime_get_ns();
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
@@ -519,7 +533,6 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
s64 discardable_bytes;
u32 iops_limit;
unsigned long delay;
- unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;
discardable_extents = atomic_read(&discard_ctl->discardable_extents);
if (!discardable_extents)
@@ -550,12 +563,13 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
iops_limit = READ_ONCE(discard_ctl->iops_limit);
if (iops_limit)
- lower_limit = max_t(unsigned long, lower_limit,
- MSEC_PER_SEC / iops_limit);
+ delay = MSEC_PER_SEC / iops_limit;
+ else
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
- delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
- discard_ctl->delay = msecs_to_jiffies(delay);
+ delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
+ BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay_ms = delay;
spin_unlock(&discard_ctl->lock);
}
@@ -563,15 +577,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
/**
* btrfs_discard_update_discardable - propagate discard counters
* @block_group: block_group of interest
- * @ctl: free_space_ctl of @block_group
*
* This propagates deltas of counters up to the discard_ctl. It maintains a
* current counter and a previous counter passing the delta up to the global
* stat. Then the current counter value becomes the previous counter value.
*/
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl)
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
+ struct btrfs_free_space_ctl *ctl;
struct btrfs_discard_ctl *discard_ctl;
s32 extents_delta;
s64 bytes_delta;
@@ -581,8 +594,10 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
!btrfs_is_block_group_data_only(block_group))
return;
+ ctl = block_group->free_space_ctl;
discard_ctl = &block_group->fs_info->discard_ctl;
+ lockdep_assert_held(&ctl->tree_lock);
extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
ctl->discardable_extents[BTRFS_STAT_PREV];
if (extents_delta) {
@@ -684,10 +699,11 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
discard_ctl->prev_discard = 0;
+ discard_ctl->prev_discard_time = 0;
atomic_set(&discard_ctl->discardable_extents, 0);
atomic64_set(&discard_ctl->discardable_bytes, 0);
discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
- discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
discard_ctl->kbps_limit = 0;
discard_ctl->discard_extent_bytes = 0;
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 353228d62f5a..57b9202f427f 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -28,8 +28,7 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
/* Update operations */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
-void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
- struct btrfs_free_space_ctl *ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group);
/* Setup/cleanup operations */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index af97ddcc6b3e..765deefda92b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,7 +29,6 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
-#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
@@ -42,6 +41,7 @@
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
+#include "zoned.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -109,15 +109,13 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
- void *private_data;
+ struct inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
int mirror_num;
- /*
- * bio_offset is optional, can be used if the pages in the bio
- * can't tell us where in the file the bio should go
- */
- u64 bio_offset;
+
+ /* Optional parameter for submit_bio_start used by direct io */
+ u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
};
@@ -150,40 +148,41 @@ struct async_submit_bio {
# error
# endif
+#define DEFINE_LEVEL(stem, level) \
+ .names[level] = "btrfs-" stem "-0" #level,
+
+#define DEFINE_NAME(stem) \
+ DEFINE_LEVEL(stem, 0) \
+ DEFINE_LEVEL(stem, 1) \
+ DEFINE_LEVEL(stem, 2) \
+ DEFINE_LEVEL(stem, 3) \
+ DEFINE_LEVEL(stem, 4) \
+ DEFINE_LEVEL(stem, 5) \
+ DEFINE_LEVEL(stem, 6) \
+ DEFINE_LEVEL(stem, 7)
+
static struct btrfs_lockdep_keyset {
u64 id; /* root objectid */
- const char *name_stem; /* lock name stem */
- char names[BTRFS_MAX_LEVEL + 1][20];
- struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+ /* Longest entry: btrfs-free-space-00 */
+ char names[BTRFS_MAX_LEVEL][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
- { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
- { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
- { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
- { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
- { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
- { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
- { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
- { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
- { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
- { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
- { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
- { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
- { .id = 0, .name_stem = "tree" },
+ { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") },
+ { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") },
+ { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") },
+ { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") },
+ { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
+ { .id = 0, DEFINE_NAME("tree") },
};
-void __init btrfs_init_lockdep(void)
-{
- int i, j;
-
- /* initialize lockdep class names */
- for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
- struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
-
- for (j = 0; j < ARRAY_SIZE(ks->names); j++)
- snprintf(ks->names[j], sizeof(ks->names[j]),
- "btrfs-%s-%02d", ks->name_stem, j);
- }
-}
+#undef DEFINE_LEVEL
+#undef DEFINE_NAME
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
int level)
@@ -210,15 +209,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
+ const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]);
+ kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
+ first_page_part - BTRFS_CSUM_SIZE);
for (i = 1; i < num_pages; i++) {
kaddr = page_address(buf->pages[i]);
@@ -248,10 +248,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock) {
+ if (need_lock)
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- }
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
@@ -280,7 +278,7 @@ out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (need_lock)
- btrfs_tree_read_unlock_blocking(eb);
+ btrfs_tree_read_unlock(eb);
return ret;
}
@@ -319,7 +317,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
- if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
+ if (memcmp(disk_sb->csum, result, fs_info->csum_size))
return 1;
return 0;
@@ -443,16 +441,16 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
}
/*
- * checksum a dirty tree block before IO. This has extra checks to make sure
- * we only fill in the checksum field in the first page of a multi-page block
+ * Checksum a dirty tree block before IO. This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block.
+ * For subpage extent buffers we need bvec to also read the offset in the page.
*/
-
-static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
+ struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
u8 result[BTRFS_CSUM_SIZE];
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
struct extent_buffer *eb;
int ret;
@@ -489,7 +487,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
- write_extent_buffer(eb, result, 0, csum_size);
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
}
@@ -523,65 +521,37 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
return 1;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror)
+/* Do basic extent buffer checks at read time */
+static int validate_extent_buffer(struct extent_buffer *eb)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
- int found_level;
- struct extent_buffer *eb;
- struct btrfs_fs_info *fs_info;
- u16 csum_size;
- int ret = 0;
+ const u32 csum_size = fs_info->csum_size;
+ u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
- int reads_done;
-
- if (!page->private)
- goto out;
-
- eb = (struct extent_buffer *)page->private;
- fs_info = eb->fs_info;
- csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- /* the pending IO might have been the only thing that kept this buffer
- * in memory. Make sure we have a ref for all this other checks
- */
- atomic_inc(&eb->refs);
-
- reads_done = atomic_dec_and_test(&eb->io_pages);
- if (!reads_done)
- goto err;
-
- eb->read_mirror = mirror;
- if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
- ret = -EIO;
- goto err;
- }
+ int ret = 0;
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
eb->start, found_start);
ret = -EIO;
- goto err;
+ goto out;
}
if (check_tree_block_fsid(eb)) {
btrfs_err_rl(fs_info, "bad fsid on block %llu",
eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "bad tree block level %d on %llu",
(int)btrfs_header_level(eb), eb->start);
ret = -EIO;
- goto err;
+ goto out;
}
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
- eb, found_level);
-
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
@@ -595,7 +565,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
- goto err;
+ goto out;
}
/*
@@ -617,6 +587,37 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
btrfs_err(fs_info,
"block=%llu read time tree block corruption detected",
eb->start);
+out:
+ return ret;
+}
+
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+ struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct extent_buffer *eb;
+ int ret = 0;
+ int reads_done;
+
+ ASSERT(page->private);
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * The pending IO might have been the only thing that kept this buffer
+ * in memory. Make sure we have a ref for all this other checks
+ */
+ atomic_inc(&eb->refs);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ if (!reads_done)
+ goto err;
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -632,7 +633,7 @@ err:
clear_extent_buffer_uptodate(eb);
}
free_extent_buffer(eb);
-out:
+
return ret;
}
@@ -694,8 +695,8 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- ret = async->submit_bio_start(async->private_data, async->bio,
- async->bio_offset);
+ ret = async->submit_bio_start(async->inode, async->bio,
+ async->dio_file_offset);
if (ret)
async->status = ret;
}
@@ -715,7 +716,7 @@ static void run_one_async_done(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
- inode = async->private_data;
+ inode = async->inode;
/* If an error occurred we just want to clean up the bio and move on */
if (async->status) {
@@ -745,18 +746,19 @@ static void run_one_async_free(struct btrfs_work *work)
kfree(async);
}
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
+ u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
if (!async)
return BLK_STS_RESOURCE;
- async->private_data = private_data;
+ async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
@@ -764,7 +766,7 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
- async->bio_offset = bio_offset;
+ async->dio_file_offset = dio_file_offset;
async->status = 0;
@@ -785,7 +787,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec);
if (ret)
break;
}
@@ -793,8 +795,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -840,8 +842,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
- 0, inode, btree_submit_bio_start);
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+ 0, btree_submit_bio_start);
}
if (ret)
@@ -947,47 +949,33 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct extent_buffer *buf = NULL;
- int ret;
-
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
- if (IS_ERR(buf))
- return;
-
- ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
- if (ret < 0)
- free_extent_buffer_stale(buf);
- else
- free_extent_buffer(buf);
-}
-
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr)
+ u64 bytenr, u64 owner_root,
+ int level)
{
if (btrfs_is_testing(fs_info))
return alloc_test_extent_buffer(fs_info, bytenr);
- return alloc_extent_buffer(fs_info, bytenr);
+ return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}
/*
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
+ * @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key)
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return buf;
@@ -1012,8 +1000,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-buf->len,
fs_info->dirty_metadata_batch);
- /* ugh, clear_extent_buffer_dirty needs to lock the page */
- btrfs_set_lock_blocking_write(buf);
clear_extent_buffer_dirty(buf);
}
}
@@ -1155,7 +1141,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
- goto fail;
+ goto fail_unlock;
}
root->node = leaf;
@@ -1164,8 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->commit_root = btrfs_root_node(root);
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- root->root_item.flags = 0;
- root->root_item.byte_limit = 0;
+ btrfs_set_root_flags(&root->root_item, 0);
+ btrfs_set_root_limit(&root->root_item, 0);
btrfs_set_root_bytenr(&root->root_item, leaf->start);
btrfs_set_root_generation(&root->root_item, trans->transid);
btrfs_set_root_level(&root->root_item, 0);
@@ -1177,7 +1163,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
+
+ btrfs_tree_unlock(leaf);
key.objectid = objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1186,13 +1174,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (ret)
goto fail;
- btrfs_tree_unlock(leaf);
-
return root;
-fail:
+fail_unlock:
if (leaf)
btrfs_tree_unlock(leaf);
+fail:
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1307,7 +1294,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation, level, NULL);
+ key->objectid, generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@@ -1348,14 +1335,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
int ret;
unsigned int nofs_flag;
- root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
- root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
- GFP_NOFS);
- if (!root->free_ino_pinned || !root->free_ino_ctl) {
- ret = -ENOMEM;
- goto fail;
- }
-
/*
* We might be called under a transaction (e.g. indirect backref
* resolution) which could deadlock if it triggers memory reclaim
@@ -1372,10 +1351,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
btrfs_check_and_init_root_item(&root->root_item);
}
- btrfs_init_free_ino_ctl(root);
- spin_lock_init(&root->ino_cache_lock);
- init_waitqueue_head(&root->ino_cache_wait);
-
/*
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
@@ -1774,13 +1749,13 @@ static int transaction_kthread(void *arg)
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
u64 transid;
- time64_t now;
+ time64_t delta;
unsigned long delay;
bool cannot_commit;
do {
cannot_commit = false;
- delay = HZ * fs_info->commit_interval;
+ delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1790,12 +1765,13 @@ static int transaction_kthread(void *arg)
goto sleep;
}
- now = ktime_get_seconds();
+ delta = ktime_get_seconds() - cur->start_time;
if (cur->state < TRANS_STATE_COMMIT_START &&
- (now < cur->start_time ||
- now - cur->start_time < fs_info->commit_interval)) {
+ delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay = HZ * 5;
+ delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay = min(delay,
+ msecs_to_jiffies(fs_info->commit_interval * 1000));
goto sleep;
}
transid = cur->transid;
@@ -2044,8 +2020,6 @@ void btrfs_put_root(struct btrfs_root *root)
free_anon_bdev(root->anon_dev);
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
@@ -2260,8 +2234,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -ENOMEM;
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1,
- level, NULL);
+ BTRFS_TREE_LOG_OBJECTID,
+ fs_info->generation + 1, level,
+ NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2306,30 +2281,42 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ /* If IGNOREDATACSUMS is set don't bother reading the csum root. */
+ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+ }
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->csum_root = root;
/*
* This tree can share blocks with some other fs tree during relocation
@@ -2338,11 +2325,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
root = btrfs_get_fs_root(tree_root->fs_info,
BTRFS_DATA_RELOC_TREE_OBJECTID, true);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->data_reloc_root = root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
@@ -2355,9 +2345,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_UUID_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- if (ret != -ENOENT)
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ goto out;
+ }
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2367,11 +2359,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root)) {
- ret = PTR_ERR(root);
- goto out;
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
}
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->free_space_root = root;
}
return 0;
@@ -2627,6 +2622,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
generation = btrfs_super_generation(sb);
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ BTRFS_ROOT_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(tree_root->node)) {
handle_error = true;
@@ -2791,6 +2787,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
/* Usable values until the real ones are cached from the superblock */
fs_info->nodesize = 4096;
fs_info->sectorsize = 4096;
+ fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2873,6 +2870,109 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
return 0;
}
+/*
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount
+ * code paths.
+ */
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
+{
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+}
+
+/*
+ * Mounting logic specific to read-write file systems. Shared by open_ctree
+ * and btrfs_remount when remounting from read-only to read-write.
+ */
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+ bool clear_free_space_tree = false;
+
+ if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ clear_free_space_tree = true;
+ } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
+ btrfs_warn(fs_info, "free space tree is invalid");
+ clear_free_space_tree = true;
+ }
+
+ if (clear_free_space_tree) {
+ btrfs_info(fs_info, "clearing free space tree");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to clear free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ if (ret)
+ goto out;
+
+ down_read(&fs_info->cleanup_work_sem);
+ if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+ (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
+ up_read(&fs_info->cleanup_work_sem);
+ goto out;
+ }
+ up_read(&fs_info->cleanup_work_sem);
+
+ mutex_lock(&fs_info->cleaner_mutex);
+ ret = btrfs_recover_relocation(fs_info->tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ if (ret < 0) {
+ btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
+ goto out;
+ }
+
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info, "creating free space tree");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create free space tree: %d", ret);
+ goto out;
+ }
+ }
+
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
+ ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
+ if (ret)
+ goto out;
+ }
+
+ ret = btrfs_resume_balance_async(fs_info);
+ if (ret)
+ goto out;
+
+ ret = btrfs_resume_dev_replace_async(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info, "failed to resume dev_replace");
+ goto out;
+ }
+
+ btrfs_qgroup_rescan_resume(fs_info);
+
+ if (!fs_info->uuid_root) {
+ btrfs_info(fs_info, "creating UUID tree");
+ ret = btrfs_create_uuid_tree(fs_info);
+ if (ret) {
+ btrfs_warn(fs_info,
+ "failed to create the UUID tree %d", ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
+
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -2888,7 +2988,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
- int clear_free_space_tree = 0;
int level;
ret = init_mount_fs_info(fs_info, sb);
@@ -3035,6 +3134,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
btrfs_info(fs_info, "has skinny extents");
+ fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED);
+
/*
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
@@ -3055,6 +3156,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/* Cache block sizes */
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+ fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
/*
@@ -3111,6 +3215,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
+ BTRFS_CHUNK_TREE_OBJECTID,
generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
@@ -3134,11 +3239,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Keep the devid that is marked to be the target device for the
- * device replace procedure
+ * At this point we know all the devices that make this filesystem,
+ * including the seed devices but we don't know yet if the replace
+ * target is required. So free devices that are not part of this
+ * filesystem but skip the replace traget device which is checked
+ * below in btrfs_init_dev_replace().
*/
- btrfs_free_extra_devids(fs_devices, 0);
-
+ btrfs_free_extra_devids(fs_devices);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
@@ -3185,7 +3292,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_block_groups;
}
- btrfs_free_extra_devids(fs_devices, 1);
+ ret = btrfs_check_zoned_mode(fs_info);
+ if (ret) {
+ btrfs_err(fs_info, "failed to initialize zoned mode: %d",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
@@ -3275,22 +3387,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (ret)
goto fail_qgroup;
- if (!sb_rdonly(sb)) {
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto fail_qgroup;
-
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret < 0) {
- btrfs_warn(fs_info, "failed to recover relocation: %d",
- ret);
- err = -EINVAL;
- goto fail_qgroup;
- }
- }
-
fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
@@ -3300,78 +3396,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- return 0;
-
- if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
- btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- clear_free_space_tree = 1;
- } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
- btrfs_warn(fs_info, "free space tree is invalid");
- clear_free_space_tree = 1;
- }
-
- if (clear_free_space_tree) {
- btrfs_info(fs_info, "clearing free space tree");
- ret = btrfs_clear_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to clear free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
- !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_info(fs_info, "creating free space tree");
- ret = btrfs_create_free_space_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create free space tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- }
-
- down_read(&fs_info->cleanup_work_sem);
- if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
- (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
- up_read(&fs_info->cleanup_work_sem);
- close_ctree(fs_info);
- return ret;
- }
- up_read(&fs_info->cleanup_work_sem);
+ goto clear_oneshot;
- ret = btrfs_resume_balance_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume balance: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
-
- ret = btrfs_resume_dev_replace_async(fs_info);
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
- btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
close_ctree(fs_info);
return ret;
}
-
- btrfs_qgroup_rescan_resume(fs_info);
btrfs_discard_resume(fs_info);
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree: %d", ret);
- close_ctree(fs_info);
- return ret;
- }
- } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
- fs_info->generation !=
- btrfs_super_uuid_tree_generation(disk_super)) {
+ if (fs_info->uuid_root &&
+ (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
+ fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
btrfs_info(fs_info, "checking UUID tree");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3381,14 +3417,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
return ret;
}
}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- /*
- * backuproot only affect mount behavior, and if open_ctree succeeded,
- * no need to keep the flag
- */
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+clear_oneshot:
+ btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3469,10 +3502,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
{
struct btrfs_super_block *super;
struct page *page;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
struct address_space *mapping = bdev->bd_inode->i_mapping;
+ int ret;
+
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret == -ENOENT)
+ return ERR_PTR(-EINVAL);
+ else if (ret)
+ return ERR_PTR(ret);
- bytenr = btrfs_sb_offset(copy_num);
if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
return ERR_PTR(-EINVAL);
@@ -3486,7 +3526,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_PTR(-ENODATA);
}
- if (btrfs_super_bytenr(super) != bytenr) {
+ if (btrfs_super_bytenr(super) != bytenr_orig) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -3541,7 +3581,8 @@ static int write_dev_supers(struct btrfs_device *device,
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
int i;
int errors = 0;
- u64 bytenr;
+ int ret;
+ u64 bytenr, bytenr_orig;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3553,12 +3594,22 @@ static int write_dev_supers(struct btrfs_device *device,
struct bio *bio;
struct btrfs_super_block *disk_super;
- bytenr = btrfs_sb_offset(i);
+ bytenr_orig = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ btrfs_err(device->fs_info,
+ "couldn't get super block location for mirror %d",
+ i);
+ errors++;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
- btrfs_set_super_bytenr(sb, bytenr);
+ btrfs_set_super_bytenr(sb, bytenr_orig);
crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
@@ -3603,6 +3654,7 @@ static int write_dev_supers(struct btrfs_device *device,
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
+ btrfs_advance_sb_log(device, i);
}
return errors < i ? 0 : -1;
}
@@ -3619,6 +3671,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
int i;
int errors = 0;
bool primary_failed = false;
+ int ret;
u64 bytenr;
if (max_mirrors == 0)
@@ -3627,7 +3680,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
for (i = 0; i < max_mirrors; i++) {
struct page *page;
- bytenr = btrfs_sb_offset(i);
+ ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+ if (ret == -ENOENT) {
+ break;
+ } else if (ret < 0) {
+ errors++;
+ if (i == 0)
+ primary_failed = true;
+ continue;
+ }
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
device->commit_total_bytes)
break;
@@ -3941,14 +4002,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
}
}
- if (root->free_ino_pinned)
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- if (root->free_ino_ctl)
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
- if (root->ino_cache_inode) {
- iput(root->ino_cache_inode);
- root->ino_cache_inode = NULL;
- }
if (drop_ref)
btrfs_put_root(root);
}
@@ -4171,8 +4224,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info = buf->fs_info;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
@@ -4185,8 +4237,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- root = BTRFS_I(buf->pages[0]->mapping->host)->root;
- fs_info = root->fs_info;
btrfs_assert_tree_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
@@ -4691,3 +4741,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
+
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *l;
+ struct btrfs_key search_key;
+ struct btrfs_key found_key;
+ int slot;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+ search_key.type = -1;
+ search_key.offset = (u64)-1;
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto error;
+ BUG_ON(ret == 0); /* Corruption */
+ if (path->slots[0] > 0) {
+ slot = path->slots[0] - 1;
+ l = path->nodes[0];
+ btrfs_item_key_to_cpu(l, &found_key, slot);
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
+ } else {
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+ }
+ ret = 0;
+error:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+{
+ int ret;
+ mutex_lock(&root->objectid_mutex);
+
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
+ mutex_unlock(&root->objectid_mutex);
+ return ret;
+}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 182540bdcea0..e45057c0c016 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -43,13 +43,15 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid, int level,
- struct btrfs_key *first_key);
-void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
+ u64 owner_root, u64 parent_transid,
+ int level, struct btrfs_key *first_key);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
- u64 bytenr);
+ u64 bytenr, u64 owner_root,
+ int level);
void btrfs_clean_tree_block(struct extent_buffer *buf);
+void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
+int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
@@ -79,7 +81,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
@@ -112,10 +114,10 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset, void *private_data,
- extent_submit_bio_start_t *submit_bio_start);
+blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags,
+ u64 dio_file_offset,
+ extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -131,16 +133,15 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_init_lockdep(void);
void btrfs_set_buffer_lockdep_class(u64 objectid,
struct extent_buffer *eb, int level);
#else
-static inline void btrfs_init_lockdep(void)
-{ }
static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
struct extent_buffer *eb, int level)
{
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1a8d419d9e1f..1d4c2397d0d6 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -222,7 +222,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 9800a8306368..04083ee5ae6e 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -21,10 +21,24 @@ struct io_failure_record;
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+/*
+ * Must be cleared only during ordered extent completion or on error paths if we
+ * did not manage to submit bios and create the ordered extents for the range.
+ * Should not be cleared during page release and page invalidation (if there is
+ * an ordered extent in flight), that is left for the ordered extent completion.
+ */
#define EXTENT_DELALLOC_NEW (1U << 14)
+/*
+ * When an ordered extent successfully completes for a region marked as a new
+ * delalloc range, use this flag when clearing a new delalloc range to indicate
+ * that the VFS' inode number of bytes should be incremented and the inode's new
+ * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
+ */
+#define EXTENT_ADD_INODE_BYTES (1U << 15)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
+ EXTENT_ADD_INODE_BYTES)
/*
* Redefined bits above which are used only in the device allocation tree,
@@ -73,7 +87,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
refcount_t refs;
- unsigned state;
+ u32 state;
struct io_failure_record *failrec;
@@ -105,19 +119,18 @@ void __cold extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
+ u64 max_bytes, u32 bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
+ u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
+ u32 bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached, gfp_t mask,
struct extent_changeset *changeset);
@@ -141,7 +154,7 @@ static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
int wake = 0;
@@ -152,17 +165,19 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
+ u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
+ u32 bits, unsigned exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits);
+ u32 bits);
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
+ u64 end, u32 bits)
{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ NULL);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -175,8 +190,8 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
+ mask, NULL);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -188,16 +203,16 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state);
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
+ u64 end, u32 extra_bits,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
+ 0, NULL, cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
@@ -205,30 +220,30 @@ static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
+ 0, NULL, cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
+ return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
+ GFP_NOFS, NULL);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
+ cached_state, mask, NULL);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
+ u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
+ u64 *start_ret, u64 *end_ret, u32 bits);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5fd60b13f4f8..56ea380f5a17 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1465,7 +1465,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -1489,7 +1488,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->leave_spinning = 1;
/* now insert the actual backref */
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
BUG_ON(refs_to_add != 1);
@@ -1605,7 +1603,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
if (ret < 0) {
err = ret;
@@ -2133,25 +2130,6 @@ static u64 find_middle(struct rb_root *root)
#endif
/*
- * Takes the number of bytes to be csumm'ed and figures out how many leaves it
- * would require to store the csums for that many bytes.
- */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
- u64 csum_size;
- u64 num_csums_per_leaf;
- u64 num_csums;
-
- csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
- num_csums_per_leaf = div64_u64(csum_size,
- (u64)btrfs_super_csum_size(fs_info->super_copy));
- num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
- num_csums += num_csums_per_leaf - 1;
- num_csums = div64_u64(num_csums, num_csums_per_leaf);
- return num_csums;
-}
-
-/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
* 0, which means to process everything in the tree at the start
@@ -2663,6 +2641,11 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
BUG_ON(!btrfs_block_group_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
} else {
+ /*
+ * We must wait for v1 caching to finish, otherwise we may not
+ * remove our space.
+ */
+ btrfs_wait_space_cache_v1_finished(block_group, caching_ctl);
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
@@ -2730,31 +2713,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
atomic_inc(&bg->reservations);
}
-void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_caching_control *next;
- struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group *cache;
-
- down_write(&fs_info->commit_root_sem);
-
- list_for_each_entry_safe(caching_ctl, next,
- &fs_info->caching_block_groups, list) {
- cache = caching_ctl->block_group;
- if (btrfs_block_group_done(cache)) {
- cache->last_byte_to_unpin = (u64)-1;
- list_del_init(&caching_ctl->list);
- btrfs_put_caching_control(caching_ctl);
- } else {
- cache->last_byte_to_unpin = caching_ctl->progress;
- }
- }
-
- up_write(&fs_info->commit_root_sem);
-
- btrfs_update_global_block_rsv(fs_info);
-}
-
/*
* Returns the free cluster for the given space info and sets empty_cluster to
* what it should be based on the mount options.
@@ -2816,11 +2774,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
- if (start < cache->last_byte_to_unpin) {
- len = min(len, cache->last_byte_to_unpin - start);
- if (return_free_space)
- btrfs_add_free_space(cache, start, len);
+ down_read(&fs_info->commit_root_sem);
+ if (start < cache->last_byte_to_unpin && return_free_space) {
+ u64 add_len = min(len, cache->last_byte_to_unpin - start);
+
+ btrfs_add_free_space(cache, start, add_len);
}
+ up_read(&fs_info->commit_root_sem);
start += len;
total_unpinned += len;
@@ -3040,8 +3000,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
if (!is_data && refs_to_drop != 1) {
@@ -3106,7 +3064,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
- path->leave_spinning = 1;
/* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
@@ -4448,7 +4405,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
@@ -4533,7 +4489,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
&extent_key, size);
if (ret) {
@@ -4662,7 +4617,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
if (IS_ERR(buf))
return buf;
@@ -4679,12 +4634,16 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
+ /*
+ * This needs to stay, because we could allocate a freed block from an
+ * old tree into a new tree, so we need to make sure this new block is
+ * set to the appropriate level and owner.
+ */
btrfs_set_buffer_lockdep_class(owner, buf, level);
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- btrfs_set_lock_blocking_write(buf);
set_extent_buffer_uptodate(buf);
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
@@ -4905,7 +4864,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(fs_info, bytenr);
+ btrfs_readahead_node_child(eb, slot);
nread++;
}
wc->reada_slot = slot;
@@ -5064,16 +5023,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = find_extent_buffer(fs_info, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ root->root_key.objectid, level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
-
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
- level - 1);
reada = 1;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
&wc->refs[level - 1],
@@ -5124,8 +5080,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation, level - 1,
- &first_key);
+ next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
+ generation, level - 1, &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -5133,7 +5089,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
return -EIO;
}
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
}
level--;
@@ -5145,7 +5100,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
path->nodes[level] = next;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
@@ -5273,8 +5228,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level]) {
BUG_ON(level == 0);
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
@@ -5317,8 +5271,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (!path->locks[level] &&
btrfs_header_generation(eb) == trans->transid) {
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
}
btrfs_clean_tree_block(eb);
}
@@ -5486,9 +5439,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(path->nodes[level]);
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
memset(&wc->update_progress, 0,
sizeof(wc->update_progress));
} else {
@@ -5496,7 +5448,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
memcpy(&wc->update_progress, &key,
sizeof(wc->update_progress));
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5516,8 +5468,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
level = btrfs_header_level(root->node);
while (1) {
btrfs_tree_lock(path->nodes[level]);
- btrfs_set_lock_blocking_write(path->nodes[level]);
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
@@ -5529,7 +5480,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
BUG_ON(wc->refs[level] == 0);
- if (level == root_item->drop_level)
+ if (level == btrfs_root_drop_level(root_item))
break;
btrfs_tree_unlock(path->nodes[level]);
@@ -5574,7 +5525,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
}
btrfs_cpu_key_to_disk(&root_item->drop_progress,
&wc->drop_progress);
- root_item->drop_level = wc->drop_level;
+ btrfs_set_root_drop_level(root_item, wc->drop_level);
BUG_ON(wc->level == 0);
if (btrfs_should_end_transaction(trans) ||
@@ -5704,7 +5655,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
- path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_WRITE_LOCK;
wc->refs[parent_level] = 1;
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 60f5f68d892d..6e3b72e63e42 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -142,7 +142,7 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static int add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, u32 bits,
struct extent_changeset *changeset,
int set)
{
@@ -530,7 +530,7 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits,
+ struct extent_state *state, u32 *bits,
struct extent_changeset *changeset);
/*
@@ -547,7 +547,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -628,11 +628,11 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake,
+ u32 *bits, int wake,
struct extent_changeset *changeset)
{
struct extent_state *next;
- unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -695,9 +695,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+ u32 bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -868,7 +868,7 @@ static void wait_on_state(struct extent_io_tree *tree,
* The tree lock is taken by this function
*/
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits)
+ u32 bits)
{
struct extent_state *state;
struct rb_node *node;
@@ -915,9 +915,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, struct extent_changeset *changeset)
+ u32 *bits, struct extent_changeset *changeset)
{
- unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data && is_data_inode(tree->private_data))
@@ -961,12 +961,10 @@ static void cache_state(struct extent_state *state,
*
* [start, end] is inclusive This takes the tree lock.
*/
-
-static int __must_check
-__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned exclusive_bits,
- u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask, struct extent_changeset *changeset)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ u32 exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -980,6 +978,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+ if (exclusive_bits)
+ ASSERT(failed_start);
+ else
+ ASSERT(failed_start == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
@@ -1179,15 +1181,6 @@ out:
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 * failed_start,
- struct extent_state **cached_state, gfp_t mask)
-{
- return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask, NULL);
-}
-
-
/**
* convert_extent_bit - convert all bits in a given range from one bit to
* another
@@ -1207,7 +1200,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* All allocations are done with GFP_NOFS.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
+ u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1408,7 +1401,7 @@ out:
/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1418,19 +1411,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
BUG_ON(bits & EXTENT_LOCKED);
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
- changeset);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
+ changeset);
}
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits)
+ u32 bits)
{
- return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
- GFP_NOWAIT, NULL);
+ return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
+ GFP_NOWAIT, NULL);
}
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
+ u32 bits, int wake, int delete,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, bits, wake, delete,
@@ -1438,7 +1431,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset)
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCKED case, same reason as
@@ -1461,9 +1454,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
- EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1479,8 +1472,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int err;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS, NULL);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -1526,8 +1519,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
* nothing was found after 'start'
*/
static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned bits)
+find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1554,14 +1546,15 @@ out:
}
/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
+ * Find the first offset in the io tree with one or more @bits set.
*
- * If nothing was found, 1 is returned. If found something, return 0.
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
+ u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1612,7 +1605,7 @@ out:
* returned will be the full contiguous area with the bits set.
*/
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
int ret = 1;
@@ -1649,7 +1642,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits)
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct rb_node *node, *prev = NULL, *next;
@@ -1946,7 +1939,7 @@ static int __process_pages_contig(struct address_space *mapping,
unsigned long page_ops, pgoff_t *index_ret)
{
unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_locked = 0;
+ unsigned long pages_processed = 0;
pgoff_t index = start_index;
struct page *pages[16];
unsigned ret;
@@ -1981,7 +1974,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
continue;
}
if (page_ops & PAGE_CLEAR_DIRTY)
@@ -2006,7 +1999,7 @@ static int __process_pages_contig(struct address_space *mapping,
}
}
put_page(pages[i]);
- pages_locked++;
+ pages_processed++;
}
nr_pages -= ret;
index += ret;
@@ -2014,14 +2007,13 @@ static int __process_pages_contig(struct address_space *mapping,
}
out:
if (err && index_ret)
- *index_ret = start_index + pages_locked - 1;
+ *index_ret = start_index + pages_processed - 1;
return err;
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned clear_bits,
- unsigned long page_ops)
+ u32 clear_bits, unsigned long page_ops)
{
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
@@ -2037,7 +2029,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned bits, int contig)
+ u32 bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -2157,7 +2149,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled, struct extent_state *cached)
+ u32 bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2642,7 +2634,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
}
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook)
@@ -2652,7 +2644,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
- const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ const int icsum = bio_offset >> fs_info->sectorsize_bits;
bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
@@ -2685,7 +2677,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_bio->bi_private = failed_bio->bi_private;
if (failed_io_bio->csum) {
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
repair_io_bio->csum = repair_io_bio->csum_inline;
memcpy(repair_io_bio->csum,
@@ -2775,16 +2767,88 @@ static void end_bio_extent_writepage(struct bio *bio)
bio_put(bio);
}
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
- int uptodate)
+/*
+ * Record previously processed extent range
+ *
+ * For endio_readpage_release_extent() to handle a full extent range, reducing
+ * the extent io operations.
+ */
+struct processed_extent {
+ struct btrfs_inode *inode;
+ /* Start of the range in @inode */
+ u64 start;
+ /* End of the range in in @inode */
+ u64 end;
+ bool uptodate;
+};
+
+/*
+ * Try to release processed extent range
+ *
+ * May not release the extent range right now if the current range is
+ * contiguous to processed extent.
+ *
+ * Will release processed extent when any of @inode, @uptodate, the range is
+ * no longer contiguous to the processed range.
+ *
+ * Passing @inode == NULL will force processed extent to be released.
+ */
+static void endio_readpage_release_extent(struct processed_extent *processed,
+ struct btrfs_inode *inode, u64 start, u64 end,
+ bool uptodate)
{
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ struct extent_io_tree *tree;
+
+ /* The first extent, initialize @processed */
+ if (!processed->inode)
+ goto update;
- if (uptodate && tree->track_uptodate)
- set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
- unlock_extent_cached_atomic(tree, start, end, &cached);
+ /*
+ * Contiguous to processed extent, just uptodate the end.
+ *
+ * Several things to notice:
+ *
+ * - bio can be merged as long as on-disk bytenr is contiguous
+ * This means we can have page belonging to other inodes, thus need to
+ * check if the inode still matches.
+ * - bvec can contain range beyond current page for multi-page bvec
+ * Thus we need to do processed->end + 1 >= start check
+ */
+ if (processed->inode == inode && processed->uptodate == uptodate &&
+ processed->end + 1 >= start && end >= processed->end) {
+ processed->end = end;
+ return;
+ }
+
+ tree = &processed->inode->io_tree;
+ /*
+ * Now we don't have range contiguous to the processed range, release
+ * the processed range now.
+ */
+ if (processed->uptodate && tree->track_uptodate)
+ set_extent_uptodate(tree, processed->start, processed->end,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(tree, processed->start, processed->end,
+ &cached);
+
+update:
+ /* Update processed to current range */
+ processed->inode = inode;
+ processed->start = start;
+ processed->end = end;
+ processed->uptodate = uptodate;
+}
+
+static void endio_readpage_update_page_status(struct page *page, bool uptodate)
+{
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
}
/*
@@ -2804,12 +2868,12 @@ static void end_bio_extent_readpage(struct bio *bio)
int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
- u64 offset = 0;
- u64 start;
- u64 end;
- u64 len;
- u64 extent_start = 0;
- u64 extent_len = 0;
+ struct processed_extent processed = { 0 };
+ /*
+ * The offset to the beginning of a bio, since one bio can never be
+ * larger than UINT_MAX, u32 here is enough.
+ */
+ u32 bio_offset = 0;
int mirror;
int ret;
struct bvec_iter_all iter_all;
@@ -2819,42 +2883,48 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 start;
+ u64 end;
+ u32 len;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- (u64)bio->bi_iter.bi_sector, bio->bi_status,
+ bio->bi_iter.bi_sector, bio->bi_status,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
-
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ /*
+ * We always issue full-sector reads, but if some block in a
+ * page fails to read, blk_update_request() will advance
+ * bv_offset and adjust bv_len to compensate. Print a warning
+ * for unaligned offsets, and an error if they don't add up to
+ * a full sector.
+ */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
+ sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page read with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
len = bvec->bv_len;
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio, offset, page,
- start, end, mirror);
+ ret = btrfs_verify_data_csum(io_bio,
+ bio_offset, page, start, end,
+ mirror);
else
ret = btrfs_validate_metadata_buffer(io_bio,
- offset, page, start, end, mirror);
+ page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2879,12 +2949,14 @@ static void end_bio_extent_readpage(struct bio *bio)
* If it can't handle the error it will return -EIO and
* we remain responsible for that page.
*/
- if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ if (!btrfs_submit_read_repair(inode, bio, bio_offset,
+ page,
start - page_offset(page),
start, end, mirror,
btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
- offset += len;
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
continue;
}
} else {
@@ -2908,40 +2980,17 @@ readpage_ok:
off = offset_in_page(i_size);
if (page->index == end_index && off)
zero_user_segment(page, off, PAGE_SIZE);
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- offset += len;
-
- if (unlikely(!uptodate)) {
- if (extent_len) {
- endio_readpage_release_extent(tree,
- extent_start,
- extent_len, 1);
- extent_start = 0;
- extent_len = 0;
- }
- endio_readpage_release_extent(tree, start,
- end - start + 1, 0);
- } else if (!extent_len) {
- extent_start = start;
- extent_len = end + 1 - start;
- } else if (extent_start + extent_len == start) {
- extent_len += end + 1 - start;
- } else {
- endio_readpage_release_extent(tree, extent_start,
- extent_len, uptodate);
- extent_start = start;
- extent_len = end + 1 - start;
}
- }
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
- if (extent_len)
- endio_readpage_release_extent(tree, extent_start, extent_len,
- uptodate);
+ /* Update page status and unlock */
+ endio_readpage_update_page_status(page, uptodate);
+ endio_readpage_release_extent(&processed, BTRFS_I(inode),
+ start, end, uptodate);
+ }
+ /* Release the last extent */
+ endio_readpage_release_extent(&processed, NULL, 0, 0, false);
btrfs_io_bio_free_csum(io_bio);
bio_put(bio);
}
@@ -3038,7 +3087,7 @@ static int submit_extent_page(unsigned int opf,
{
int ret = 0;
struct bio *bio;
- size_t page_size = min_t(size_t, size, PAGE_SIZE);
+ size_t io_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -3054,12 +3103,12 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
+ if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags))
can_merge = false;
if (prev_bio_flags != bio_flags || !contig || !can_merge ||
force_bio_submit ||
- bio_add_page(bio, page, page_size, pg_offset) < page_size) {
+ bio_add_page(bio, page, io_size, pg_offset) < io_size) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -3068,13 +3117,13 @@ static int submit_extent_page(unsigned int opf,
bio = NULL;
} else {
if (wbc)
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
return 0;
}
}
bio = btrfs_bio_alloc(offset);
- bio_add_page(bio, page, page_size, pg_offset);
+ bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
@@ -3085,7 +3134,7 @@ static int submit_extent_page(unsigned int opf,
bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, page, page_size);
+ wbc_account_cgroup_owner(wbc, page, io_size);
}
*bio_ret = bio;
@@ -3096,6 +3145,15 @@ static int submit_extent_page(unsigned int opf,
static void attach_extent_buffer_page(struct extent_buffer *eb,
struct page *page)
{
+ /*
+ * If the page is mapped to btree inode, we should hold the private
+ * lock to prevent race.
+ * For cloned or dummy extent buffers, their pages are not mapped and
+ * will not race with any other ebs.
+ */
+ if (page->mapping)
+ lockdep_assert_held(&page->mapping->private_lock);
+
if (!PagePrivate(page))
attach_page_private(page, eb);
else
@@ -3158,7 +3216,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
- size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -3224,13 +3281,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
- if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
- disk_io_size = em->block_len;
+ if (this_bio_flag & EXTENT_BIO_COMPRESSED)
offset = em->block_start;
- } else {
+ else
offset = em->block_start + extent_offset;
- disk_io_size = iosize;
- }
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3319,7 +3373,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, offset, disk_io_size,
+ page, offset, iosize,
pg_offset, bio,
end_bio_extent_readpage, 0,
*bio_flags,
@@ -3656,11 +3710,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
}
/*
- * Lock eb pages and flush the bio if we can't the locks
+ * Lock extent buffer status and pages for writeback.
+ *
+ * May try to flush write bio if we can't get the lock.
*
- * Return 0 if nothing went wrong
- * Return >0 is same as 0, except bio is not submitted
- * Return <0 if something went wrong, no page is locked
+ * Return 0 if the extent buffer doesn't need to be submitted.
+ * (E.g. the extent buffer is not dirty)
+ * Return >0 is the extent buffer is submitted to bio.
+ * Return <0 if something went wrong, no page is locked.
*/
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
struct extent_page_data *epd)
@@ -3930,10 +3987,81 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
+/*
+ * Submit all page(s) of one extent buffer.
+ *
+ * @page: the page of one extent buffer
+ * @eb_context: to determine if we need to submit this page, if current page
+ * belongs to this eb, we don't need to submit
+ *
+ * The caller should pass each page in their bytenr order, and here we use
+ * @eb_context to determine if we have submitted pages of one extent buffer.
+ *
+ * If we have, we just skip until we hit a new page that doesn't belong to
+ * current @eb_context.
+ *
+ * If not, we submit all the page(s) of the extent buffer.
+ *
+ * Return >0 if we have submitted the extent buffer successfully.
+ * Return 0 if we don't need to submit the page, as it's already submitted by
+ * previous call.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_page(struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ struct extent_buffer **eb_context)
+{
+ struct address_space *mapping = page->mapping;
+ struct extent_buffer *eb;
+ int ret;
+
+ if (!PagePrivate(page))
+ return 0;
+
+ spin_lock(&mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ eb = (struct extent_buffer *)page->private;
+
+ /*
+ * Shouldn't happen and normally this would be a BUG_ON but no point
+ * crashing the machine for something we can survive anyway.
+ */
+ if (WARN_ON(!eb)) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+
+ if (eb == *eb_context) {
+ spin_unlock(&mapping->private_lock);
+ return 0;
+ }
+ ret = atomic_inc_not_zero(&eb->refs);
+ spin_unlock(&mapping->private_lock);
+ if (!ret)
+ return 0;
+
+ *eb_context = eb;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret <= 0) {
+ free_extent_buffer(eb);
+ return ret;
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ return 1;
+}
+
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_buffer *eb, *prev_eb = NULL;
+ struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
.bio = NULL,
.extent_locked = 0,
@@ -3979,55 +4107,13 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (!PagePrivate(page))
- continue;
-
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ ret = submit_eb_page(page, wbc, &epd, &eb_context);
+ if (ret == 0)
continue;
- }
-
- eb = (struct extent_buffer *)page->private;
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON
- * but no sense in crashing the users box for something
- * we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- if (eb == prev_eb) {
- spin_unlock(&mapping->private_lock);
- continue;
- }
-
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
- if (!ret)
- continue;
-
- prev_eb = eb;
- ret = lock_extent_buffer_for_io(eb, &epd);
- if (!ret) {
- free_extent_buffer(eb);
- continue;
- } else if (ret < 0) {
- done = 1;
- free_extent_buffer(eb);
- break;
- }
-
- ret = write_one_eb(eb, wbc, &epd);
- if (ret) {
+ if (ret < 0) {
done = 1;
- free_extent_buffer(eb);
break;
}
- free_extent_buffer(eb);
/*
* the filesystem may choose to bump up nr_to_write.
@@ -4048,7 +4134,6 @@ retry:
index = 0;
goto retry;
}
- ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
return ret;
@@ -4382,14 +4467,22 @@ int extent_invalidatepage(struct extent_io_tree *tree,
u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ /* This function is only called for the btree inode */
+ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
+
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
- clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 1, 1, &cached_state);
+
+ /*
+ * Currently for btree io tree, only EXTENT_LOCKED is utilized,
+ * so here we only need to unlock the extent range to free any
+ * existing extent state.
+ */
+ unlock_extent_cached(tree, start, end, &cached_state);
return 0;
}
@@ -4409,12 +4502,14 @@ static int try_release_extent_state(struct extent_io_tree *tree,
ret = 0;
} else {
/*
- * at this point we can safely clear everything except the
- * locked bit and the nodatasum bit
+ * At this point we can safely clear everything except the
+ * locked bit, the nodatasum bit and the delalloc new bit.
+ * The delalloc new bit will be cleared by ordered extent
+ * completion.
*/
ret = __clear_extent_bit(tree, start, end,
- ~(EXTENT_LOCKED | EXTENT_NODATASUM),
- 0, 0, NULL, mask, NULL);
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
+ 0, 0, NULL, mask, NULL);
/* if clear_extent_bit failed for enomem reasons,
* we can't allow the release to continue.
@@ -4691,7 +4786,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
@@ -4950,12 +5044,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->len = len;
eb->fs_info = fs_info;
eb->bflags = 0;
- rwlock_init(&eb->lock);
- atomic_set(&eb->blocking_readers, 0);
- eb->blocking_writers = 0;
- eb->lock_recursed = false;
- init_waitqueue_head(&eb->write_lock_wq);
- init_waitqueue_head(&eb->read_lock_wq);
+ init_rwsem(&eb->lock);
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
@@ -4964,19 +5053,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
atomic_set(&eb->refs, 1);
atomic_set(&eb->io_pages, 0);
- /*
- * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
- */
- BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
- > MAX_INLINE_EXTENT_BUFFER_SIZE);
- BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
-
-#ifdef CONFIG_BTRFS_DEBUG
- eb->spinning_writers = 0;
- atomic_set(&eb->spinning_readers, 0);
- atomic_set(&eb->read_locks, 0);
- eb->write_locks = 0;
-#endif
+ ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
@@ -5105,7 +5182,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_SHIFT);
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -5157,7 +5234,7 @@ again:
}
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5178,7 +5255,7 @@ free_eb:
#endif
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+ u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
int num_pages;
@@ -5196,6 +5273,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ offset_in_page(start) + len > PAGE_SIZE) {
+ btrfs_err(fs_info,
+ "tree block crosses page boundary, start %llu nodesize %lu",
+ start, len);
+ return ERR_PTR(-EINVAL);
+ }
+
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
@@ -5203,6 +5288,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return ERR_PTR(-ENOMEM);
+ btrfs_set_buffer_lockdep_class(owner_root, eb, level);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
@@ -5231,13 +5317,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
exists = NULL;
- /*
- * Do this so attach doesn't complain and we need to
- * drop the ref the old guy had.
- */
- ClearPagePrivate(p);
WARN_ON(PageDirty(p));
- put_page(p);
+ detach_page_private(p);
}
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
@@ -5265,7 +5346,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_SHIFT, eb);
+ start >> fs_info->sectorsize_bits, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5321,7 +5402,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_SHIFT);
+ eb->start >> fs_info->sectorsize_bits);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
@@ -5622,12 +5703,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5652,13 +5733,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5687,13 +5768,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5719,7 +5800,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5729,7 +5810,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
- kaddr = page_address(eb->pages[0]);
+ kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
}
@@ -5742,12 +5823,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5771,12 +5852,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = start >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(start);
if (check_eb_range(eb, start, len))
return;
- offset = offset_in_page(start);
+ offset = get_eb_offset_in_page(eb, start);
while (len > 0) {
page = eb->pages[i];
@@ -5800,10 +5881,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst,
ASSERT(dst->len == src->len);
- num_pages = num_extent_pages(dst);
- for (i = 0; i < num_pages; i++)
- copy_page(page_address(dst->pages[i]),
- page_address(src->pages[i]));
+ if (dst->fs_info->sectorsize == PAGE_SIZE) {
+ num_pages = num_extent_pages(dst);
+ for (i = 0; i < num_pages; i++)
+ copy_page(page_address(dst->pages[i]),
+ page_address(src->pages[i]));
+ } else {
+ size_t src_offset = get_eb_offset_in_page(src, 0);
+ size_t dst_offset = get_eb_offset_in_page(dst, 0);
+
+ ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
+ memcpy(page_address(dst->pages[0]) + dst_offset,
+ page_address(src->pages[0]) + src_offset,
+ src->len);
+ }
}
void copy_extent_buffer(const struct extent_buffer *dst,
@@ -5816,7 +5907,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
size_t offset;
struct page *page;
char *kaddr;
- unsigned long i = dst_offset >> PAGE_SHIFT;
+ unsigned long i = get_eb_page_index(dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -5824,7 +5915,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = offset_in_page(dst_offset);
+ offset = get_eb_offset_in_page(dst, dst_offset);
while (len > 0) {
page = dst->pages[i];
@@ -5868,7 +5959,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + byte_offset;
+ offset = start + offset_in_page(eb->start) + byte_offset;
*page_index = offset >> PAGE_SHIFT;
*page_offset = offset_in_page(offset);
@@ -6022,11 +6113,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
return;
while (len > 0) {
- dst_off_in_page = offset_in_page(dst_offset);
- src_off_in_page = offset_in_page(src_offset);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
+ src_off_in_page = get_eb_offset_in_page(dst, src_offset);
- dst_i = dst_offset >> PAGE_SHIFT;
- src_i = src_offset >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_offset);
+ src_i = get_eb_page_index(src_offset);
cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
@@ -6062,11 +6153,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
while (len > 0) {
- dst_i = dst_end >> PAGE_SHIFT;
- src_i = src_end >> PAGE_SHIFT;
+ dst_i = get_eb_page_index(dst_end);
+ src_i = get_eb_page_index(src_end);
- dst_off_in_page = offset_in_page(dst_end);
- src_off_in_page = offset_in_page(src_end);
+ dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
+ src_off_in_page = get_eb_offset_in_page(dst, src_end);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
@@ -6121,3 +6212,54 @@ int try_release_extent_buffer(struct page *page)
return release_extent_buffer(eb);
}
+
+/*
+ * btrfs_readahead_tree_block - attempt to readahead a child block
+ * @fs_info: the fs_info
+ * @bytenr: bytenr to read
+ * @owner_root: objectid of the root that owns this eb
+ * @gen: generation for the uptodate check, can be 0
+ * @level: level for the eb
+ *
+ * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
+ * normal uptodate check of the eb, without checking the generation. If we have
+ * to read the block we will not block on anything.
+ */
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level)
+{
+ struct extent_buffer *eb;
+ int ret;
+
+ eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
+ if (IS_ERR(eb))
+ return;
+
+ if (btrfs_buffer_uptodate(eb, gen, 1)) {
+ free_extent_buffer(eb);
+ return;
+ }
+
+ ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
+ if (ret < 0)
+ free_extent_buffer_stale(eb);
+ else
+ free_extent_buffer(eb);
+}
+
+/*
+ * btrfs_readahead_node_child - readahead a node's child block
+ * @node: parent node we're reading from
+ * @slot: slot in the parent node for the child we want to read
+ *
+ * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
+ * the slot in the node provided.
+ */
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
+{
+ btrfs_readahead_tree_block(node->fs_info,
+ btrfs_node_blockptr(node, slot),
+ btrfs_header_owner(node),
+ btrfs_node_ptr_generation(node, slot),
+ btrfs_header_level(node) - 1);
+}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e7f7ef..19221095c635 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -6,6 +6,7 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/fiemap.h>
+#include <linux/btrfs_tree.h>
#include "ulist.h"
/*
@@ -71,11 +72,10 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags);
-typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
- struct bio *bio, u64 bio_offset);
+typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
+ struct bio *bio, u64 dio_file_offset);
-#define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
+#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
@@ -87,31 +87,13 @@ struct extent_buffer {
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
-
- int blocking_writers;
- atomic_t blocking_readers;
- bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
- short log_index;
-
- /* protects write locks */
- rwlock_t lock;
+ s8 log_index;
- /* readers use lock_wq while they wait for the write
- * lock holders to unlock
- */
- wait_queue_head_t write_lock_wq;
+ struct rw_semaphore lock;
- /* writers use read_lock_wq while they wait for readers
- * to unlock
- */
- wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
- int spinning_writers;
- atomic_t spinning_readers;
- atomic_t read_locks;
- int write_locks;
struct list_head leak_list;
#endif
};
@@ -199,7 +181,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start, u64 owner_root, int level);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -215,11 +197,20 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 owner_root, u64 gen, int level);
+void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
static inline int num_extent_pages(const struct extent_buffer *eb)
{
- return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) -
- (eb->start >> PAGE_SHIFT);
+ /*
+ * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
+ * sectorsize, it's just eb->len >> PAGE_SHIFT.
+ *
+ * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE,
+ * thus have to ensure we get at least one page.
+ */
+ return (eb->len >> PAGE_SHIFT) ?: 1;
}
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
@@ -270,8 +261,7 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned bits_to_clear,
- unsigned long page_ops);
+ u32 bits_to_clear, unsigned long page_ops);
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
@@ -307,7 +297,7 @@ struct io_failure_record {
blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u64 phy_offset,
+ struct bio *failed_bio, u32 bio_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8f4f2bd6d9b9..1545c22ef280 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -38,27 +38,27 @@
* Finally new_i_size should only be set in the case of truncate where we're not
* ready to use i_size_read() as the limiter yet.
*/
-void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size)
+void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
- i_size = new_i_size ?: i_size_read(inode);
+ i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- BTRFS_I(inode)->disk_i_size = i_size;
+ inode->disk_i_size = i_size;
return;
}
- spin_lock(&BTRFS_I(inode)->lock);
- ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0,
- &start, &end, EXTENT_DIRTY);
+ spin_lock(&inode->lock);
+ ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
- BTRFS_I(inode)->disk_i_size = i_size;
- spin_unlock(&BTRFS_I(inode)->lock);
+ inode->disk_i_size = i_size;
+ spin_unlock(&inode->lock);
}
/**
@@ -142,7 +142,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -181,7 +180,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_csum_item *item;
struct extent_buffer *leaf;
u64 csum_offset = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -201,7 +200,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
goto fail;
csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ fs_info->sectorsize_bits;
csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
csums_in_item /= csum_size;
@@ -239,12 +238,117 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
+ * estore the result to @dst.
+ *
+ * Return >0 for the number of sectors we found.
+ * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
+ * for it. Caller may want to try next sector until one range is hit.
+ * Return <0 for fatal error.
+ */
+static int search_csum_tree(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 disk_bytenr,
+ u64 len, u8 *dst)
+{
+ struct btrfs_csum_item *item = NULL;
+ struct btrfs_key key;
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 itemsize;
+ int ret;
+ u64 csum_start;
+ u64 csum_len;
+
+ ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
+ IS_ALIGNED(len, sectorsize));
+
+ /* Check if the current csum item covers disk_bytenr */
+ if (path->nodes[0]) {
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_csum_item);
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+
+ if (in_range(disk_bytenr, csum_start, csum_len))
+ goto found;
+ }
+
+ /* Current item doesn't contain the desired range, search again */
+ btrfs_release_path(path);
+ item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0);
+ if (IS_ERR(item)) {
+ ret = PTR_ERR(item);
+ goto out;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+
+ csum_start = key.offset;
+ csum_len = (itemsize / csum_size) * sectorsize;
+ ASSERT(in_range(disk_bytenr, csum_start, csum_len));
+
+found:
+ ret = (min(csum_start + csum_len, disk_bytenr + len) -
+ disk_bytenr) >> fs_info->sectorsize_bits;
+ read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
+ ret * csum_size);
+out:
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
+}
+
+/*
+ * Locate the file_offset of @cur_disk_bytenr of a @bio.
+ *
+ * Bio of btrfs represents read range of
+ * [bi_sector << 9, bi_sector << 9 + bi_size).
+ * Knowing this, we can iterate through each bvec to locate the page belong to
+ * @cur_disk_bytenr and get the file offset.
+ *
+ * @inode is used to determine if the bvec page really belongs to @inode.
+ *
+ * Return 0 if we can't find the file offset
+ * Return >0 if we find the file offset and restore it to @file_offset_ret
+ */
+static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
+ u64 disk_bytenr, u64 *file_offset_ret)
+{
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ int ret = 0;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ struct page *page = bvec.bv_page;
+
+ if (cur > disk_bytenr)
+ break;
+ if (cur + bvec.bv_len <= disk_bytenr) {
+ cur += bvec.bv_len;
+ continue;
+ }
+ ASSERT(in_range(disk_bytenr, cur, bvec.bv_len));
+ if (page->mapping && page->mapping->host &&
+ page->mapping->host == inode) {
+ ret = 1;
+ *file_offset_ret = page_offset(page) + bvec.bv_offset +
+ disk_bytenr - cur;
+ break;
+ }
+ }
+ return ret;
+}
+
/**
- * btrfs_lookup_bio_sums - Look up checksums for a bio.
+ * Lookup the checksum for the read bio in csum tree.
+ *
* @inode: inode that the bio is for.
* @bio: bio to look up.
- * @offset: Unless (u64)-1, look up checksums for this offset in the file.
- * If (u64)-1, use the page offsets from the bio instead.
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
@@ -252,31 +356,40 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 offset, u8 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct bio_vec bvec;
- struct bvec_iter iter;
- struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
- const bool page_offsets = (offset == (u64)-1);
+ const u32 sectorsize = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ u32 orig_len = bio->bi_iter.bi_size;
+ u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_bytenr;
u8 *csum;
- u64 item_start_offset = 0;
- u64 item_last_offset = 0;
- u64 disk_bytenr;
- u64 page_bytes_left;
- u32 diff;
- int nblocks;
+ const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+ return BLK_STS_OK;
+
+ /*
+ * This function is only called for read bio.
+ *
+ * This means two things:
+ * - All our csums should only be in csum tree
+ * No ordered extents csums, as ordered extents are only for write
+ * path.
+ * - No need to bother any other info from bvec
+ * Since we're looking up csums, the only important info is the
+ * disk_bytenr and the length, which can be extracted from bi_iter
+ * directly.
+ */
+ ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
return BLK_STS_RESOURCE;
- nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
@@ -295,7 +408,11 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
csum = dst;
}
- if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
+ /*
+ * If requested number of sectors is larger than one leaf can contain,
+ * kick the readahead for csum tree.
+ */
+ if (nblocks > fs_info->csums_per_leaf)
path->reada = READA_FORWARD;
/*
@@ -309,85 +426,62 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
path->skip_locking = 1;
}
- disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ for (cur_disk_bytenr = orig_disk_bytenr;
+ cur_disk_bytenr < orig_disk_bytenr + orig_len;
+ cur_disk_bytenr += (count * sectorsize)) {
+ u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr;
+ unsigned int sector_offset;
+ u8 *csum_dst;
- bio_for_each_segment(bvec, bio, iter) {
- page_bytes_left = bvec.bv_len;
- if (count)
- goto next;
-
- if (page_offsets)
- offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
- disk_bytenr, csum, nblocks);
- if (count)
- goto found;
+ /*
+ * Although both cur_disk_bytenr and orig_disk_bytenr is u64,
+ * we're calculating the offset to the bio start.
+ *
+ * Bio size is limited to UINT_MAX, thus unsigned int is large
+ * enough to contain the raw result, not to mention the right
+ * shifted result.
+ */
+ ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX);
+ sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >>
+ fs_info->sectorsize_bits;
+ csum_dst = csum + sector_offset * csum_size;
+
+ count = search_csum_tree(fs_info, path, cur_disk_bytenr,
+ search_len, csum_dst);
+ if (count <= 0) {
+ /*
+ * Either we hit a critical error or we didn't find
+ * the csum.
+ * Either way, we put zero into the csums dst, and skip
+ * to the next sector.
+ */
+ memset(csum_dst, 0, csum_size);
+ count = 1;
- if (!item || disk_bytenr < item_start_offset ||
- disk_bytenr >= item_last_offset) {
- struct btrfs_key found_key;
- u32 item_size;
-
- if (item)
- btrfs_release_path(path);
- item = btrfs_lookup_csum(NULL, fs_info->csum_root,
- path, disk_bytenr, 0);
- if (IS_ERR(item)) {
- count = 1;
- memset(csum, 0, csum_size);
- if (BTRFS_I(inode)->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
- set_extent_bits(io_tree, offset,
- offset + fs_info->sectorsize - 1,
+ /*
+ * For data reloc inode, we need to mark the range
+ * NODATASUM so that balance won't report false csum
+ * error.
+ */
+ if (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ u64 file_offset;
+ int ret;
+
+ ret = search_file_offset_in_bio(bio, inode,
+ cur_disk_bytenr, &file_offset);
+ if (ret)
+ set_extent_bits(io_tree, file_offset,
+ file_offset + sectorsize - 1,
EXTENT_NODATASUM);
- } else {
- btrfs_info_rl(fs_info,
- "no csum found for inode %llu start %llu",
- btrfs_ino(BTRFS_I(inode)), offset);
- }
- item = NULL;
- btrfs_release_path(path);
- goto found;
+ } else {
+ btrfs_warn_rl(fs_info,
+ "csum hole found for disk bytenr range [%llu, %llu)",
+ cur_disk_bytenr, cur_disk_bytenr + sectorsize);
}
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
-
- item_start_offset = found_key.offset;
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
- item_last_offset = item_start_offset +
- (item_size / csum_size) *
- fs_info->sectorsize;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_csum_item);
- }
- /*
- * this byte range must be able to fit inside
- * a single leaf so it will also fit inside a u32
- */
- diff = disk_bytenr - item_start_offset;
- diff = diff / fs_info->sectorsize;
- diff = diff * csum_size;
- count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
- inode->i_sb->s_blocksize_bits);
- read_extent_buffer(path->nodes[0], csum,
- ((unsigned long)item) + diff,
- csum_size * count);
-found:
- csum += count * csum_size;
- nblocks -= count;
-next:
- while (count > 0) {
- count--;
- disk_bytenr += fs_info->sectorsize;
- offset += fs_info->sectorsize;
- page_bytes_left -= fs_info->sectorsize;
- if (!page_bytes_left)
- break; /* move to next bio */
}
}
- WARN_ON_ONCE(count);
btrfs_free_path(path);
return BLK_STS_OK;
}
@@ -406,7 +500,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int ret;
size_t size;
u64 csum_end;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@@ -433,8 +527,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
btrfs_item_size_nr(leaf, path->slots[0] - 1))
path->slots[0]--;
@@ -484,10 +577,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
- offset = (start - key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
offset *= csum_size;
- size >>= fs_info->sb->s_blocksize_bits;
+ size >>= fs_info->sectorsize_bits;
read_extent_buffer(path->nodes[0],
sums->sums,
@@ -539,7 +631,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
int i;
u64 offset;
unsigned nofs_flag;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@@ -557,7 +648,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
else
offset = 0; /* shut up gcc */
- sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
+ sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
@@ -596,7 +687,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
ordered = btrfs_lookup_ordered_extent(inode,
offset);
ASSERT(ordered); /* Logic error */
- sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
+ sums->bytenr = (bio->bi_iter.bi_sector << 9)
+ total_bytes;
index = 0;
}
@@ -607,7 +698,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
fs_info->sectorsize,
sums->sums + index);
kunmap_atomic(data);
- index += csum_size;
+ index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
@@ -637,14 +728,14 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 len)
{
struct extent_buffer *leaf;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
u64 csum_end;
u64 end_byte = bytenr + len;
- u32 blocksize_bits = fs_info->sb->s_blocksize_bits;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
leaf = path->nodes[0];
csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
- csum_end <<= fs_info->sb->s_blocksize_bits;
+ csum_end <<= blocksize_bits;
csum_end += key->offset;
if (key->offset < bytenr && csum_end <= end_byte) {
@@ -691,8 +782,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 csum_end;
struct extent_buffer *leaf;
int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ const u32 csum_size = fs_info->csum_size;
+ u32 blocksize_bits = fs_info->sectorsize_bits;
ASSERT(root == fs_info->csum_root ||
root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -706,7 +797,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
@@ -846,7 +936,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
int index = 0;
int found_next;
int ret;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csum_size = fs_info->csum_size;
path = btrfs_alloc_path();
if (!path)
@@ -921,7 +1011,7 @@ again:
if (btrfs_leaf_free_space(leaf) >= csum_size) {
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ fs_info->sectorsize_bits;
goto extend_csum;
}
@@ -939,8 +1029,7 @@ again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- csum_offset = (bytenr - found_key.offset) >>
- fs_info->sb->s_blocksize_bits;
+ csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;
if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -956,7 +1045,7 @@ extend_csum:
u32 diff;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
extend_nr = max_t(int, 1, (int)tmp);
@@ -981,9 +1070,9 @@ insert:
u64 tmp;
tmp = sums->len - total_bytes;
- tmp >>= fs_info->sb->s_blocksize_bits;
+ tmp >>= fs_info->sectorsize_bits;
tmp = min(tmp, (next_offset - file_key.offset) >>
- fs_info->sb->s_blocksize_bits);
+ fs_info->sectorsize_bits);
tmp = max_t(u64, 1, tmp);
tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
@@ -991,10 +1080,8 @@ insert:
} else {
ins_size = csum_size;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
- path->leave_spinning = 0;
if (ret < 0)
goto out;
if (WARN_ON(ret != 0))
@@ -1007,8 +1094,7 @@ csum:
item = (struct btrfs_csum_item *)((unsigned char *)item +
csum_offset * csum_size);
found:
- ins_size = (u32)(sums->len - total_bytes) >>
- fs_info->sb->s_blocksize_bits;
+ ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
ins_size *= csum_size;
ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
ins_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4373da7bcc0d..0e41459b8de6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -462,7 +462,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
*/
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+ struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int err = 0;
@@ -474,7 +474,13 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
loff_t isize = i_size_read(&inode->vfs_inode);
unsigned int extra_bits = 0;
- start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+ if (write_bytes == 0)
+ return 0;
+
+ if (noreserve)
+ extra_bits |= EXTENT_NORESERVE;
+
+ start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
@@ -671,14 +677,16 @@ next:
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
+ *
+ * Note: the VFS' inode number of bytes is not updated, it's up to the caller
+ * to deal with that. We set the field 'bytes_found' of the arguments structure
+ * with the number of allocated bytes found in the target range, so that the
+ * caller can update the inode's number of bytes in an atomic way when
+ * replacing extents in a range to avoid races with stat(2).
*/
-int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
- struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache,
- int replace_extent,
- u32 extent_item_size,
- int *key_inserted)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_drop_extents_args *args)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
@@ -686,14 +694,13 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
struct btrfs_key new_key;
- struct inode *vfs_inode = &inode->vfs_inode;
u64 ino = btrfs_ino(inode);
- u64 search_start = start;
+ u64 search_start = args->start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
u64 extent_offset = 0;
u64 extent_end = 0;
- u64 last_end = start;
+ u64 last_end = args->start;
int del_nr = 0;
int del_slot = 0;
int extent_type;
@@ -703,11 +710,26 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int update_refs;
int found = 0;
int leafs_visited = 0;
+ struct btrfs_path *path = args->path;
+
+ args->bytes_found = 0;
+ args->extent_inserted = false;
- if (drop_cache)
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ /* Must always have a path if ->replace_extent is true */
+ ASSERT(!(args->replace_extent && !args->path));
- if (start >= inode->disk_i_size && !replace_extent)
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (args->drop_cache)
+ btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+
+ if (args->start >= inode->disk_i_size && !args->replace_extent)
modify_tree = 0;
update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
@@ -718,7 +740,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
search_start, modify_tree);
if (ret < 0)
break;
- if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+ if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
if (key.objectid == ino &&
@@ -753,7 +775,7 @@ next_slot:
path->slots[0]++;
goto next_slot;
}
- if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -795,7 +817,7 @@ next_slot:
}
found = 1;
- search_start = max(key.offset, start);
+ search_start = max(key.offset, args->start);
if (recow || !modify_tree) {
modify_tree = -1;
btrfs_release_path(path);
@@ -806,7 +828,7 @@ next_slot:
* | - range to drop - |
* | -------- extent -------- |
*/
- if (start > key.offset && end < extent_end) {
+ if (args->start > key.offset && args->end < extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -814,7 +836,7 @@ next_slot:
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = start;
+ new_key.offset = args->start;
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
@@ -828,15 +850,15 @@ next_slot:
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- extent_offset += start - key.offset;
+ extent_offset += args->start - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - start);
+ extent_end - args->start);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0) {
@@ -846,11 +868,11 @@ next_slot:
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
+ args->start - extent_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
- key.offset = start;
+ key.offset = args->start;
}
/*
* From here on out we will have actually dropped something, so
@@ -862,23 +884,23 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start <= key.offset && end < extent_end) {
+ if (args->start <= key.offset && args->end < extent_end) {
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
}
memcpy(&new_key, &key, sizeof(new_key));
- new_key.offset = end;
+ new_key.offset = args->end;
btrfs_set_item_key_safe(fs_info, path, &new_key);
- extent_offset += end - key.offset;
+ extent_offset += args->end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_end - end);
+ extent_end - args->end);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(vfs_inode, end - key.offset);
+ args->bytes_found += args->end - key.offset;
break;
}
@@ -887,7 +909,7 @@ next_slot:
* | ---- range to drop ----- |
* | -------- extent -------- |
*/
- if (start > key.offset && end >= extent_end) {
+ if (args->start > key.offset && args->end >= extent_end) {
BUG_ON(del_nr > 0);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
@@ -895,11 +917,11 @@ next_slot:
}
btrfs_set_file_extent_num_bytes(leaf, fi,
- start - key.offset);
+ args->start - key.offset);
btrfs_mark_buffer_dirty(leaf);
if (update_refs && disk_bytenr > 0)
- inode_sub_bytes(vfs_inode, extent_end - start);
- if (end == extent_end)
+ args->bytes_found += extent_end - args->start;
+ if (args->end == extent_end)
break;
path->slots[0]++;
@@ -910,7 +932,7 @@ next_slot:
* | ---- range to drop ----- |
* | ------ extent ------ |
*/
- if (start <= key.offset && end >= extent_end) {
+ if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
@@ -922,8 +944,7 @@ delete_extent_item:
if (update_refs &&
extent_type == BTRFS_FILE_EXTENT_INLINE) {
- inode_sub_bytes(vfs_inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
extent_end = ALIGN(extent_end,
fs_info->sectorsize);
} else if (update_refs && disk_bytenr > 0) {
@@ -936,11 +957,10 @@ delete_extent_item:
key.offset - extent_offset);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
- inode_sub_bytes(vfs_inode,
- extent_end - key.offset);
+ args->bytes_found += extent_end - key.offset;
}
- if (end == extent_end)
+ if (args->end == extent_end)
break;
if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
@@ -970,7 +990,7 @@ delete_extent_item:
* Set path->slots[0] to first slot, so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs, we end up with a correct and adjusted
- * path->slots[0] for our insertion (if replace_extent != 0).
+ * path->slots[0] for our insertion (if args->replace_extent).
*/
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
@@ -984,15 +1004,14 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && replace_extent && leafs_visited == 1 &&
- (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
- path->locks[0] == BTRFS_WRITE_LOCK) &&
+ if (!ret && args->replace_extent && leafs_visited == 1 &&
+ path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
- sizeof(struct btrfs_item) + extent_item_size) {
+ sizeof(struct btrfs_item) + args->extent_item_size) {
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
+ key.offset = args->start;
if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
struct btrfs_key slot_key;
@@ -1000,30 +1019,18 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key, &extent_item_size, 1);
- *key_inserted = 1;
+ setup_items_for_insert(root, path, &key,
+ &args->extent_item_size, 1);
+ args->extent_inserted = true;
}
- if (!replace_extent || !(*key_inserted))
+ if (!args->path)
+ btrfs_free_path(path);
+ else if (!args->extent_inserted)
btrfs_release_path(path);
- if (drop_end)
- *drop_end = found ? min(end, last_end) : end;
- return ret;
-}
-
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode, u64 start,
- u64 end, int drop_cache)
-{
- struct btrfs_path *path;
- int ret;
+out:
+ args->drop_end = found ? min(args->end, last_end) : args->end;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
- end, NULL, drop_cache, 0, 0, NULL);
- btrfs_free_path(path);
return ret;
}
@@ -1558,11 +1565,84 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec64 now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_time(inode);
+ if (!timespec64_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec64_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
+static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
+ size_t count)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ loff_t pos = iocb->ki_pos;
+ int ret;
+ loff_t oldsize;
+ loff_t start_pos;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
+ /* We will allocate space in case nodatacow is not set, so bail */
+ if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
+ return -EAGAIN;
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count)
+ return -EAGAIN;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
+
+ start_pos = round_down(pos, fs_info->sectorsize);
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
+ /* Expand hole size to cover write data, preventing empty gap */
+ loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
+
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
+ if (ret) {
+ current->backing_dev_info = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
+ loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct page **pages = NULL;
@@ -1572,17 +1652,37 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
u64 lockend;
size_t num_written = 0;
int nrptrs;
- int ret = 0;
+ ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
+ loff_t old_isize = i_size_read(inode);
+ unsigned int ilock_flags = 0;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ ret = btrfs_inode_lock(inode, ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, i);
+ if (ret <= 0)
+ goto out;
+ ret = btrfs_write_check(iocb, i, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
@@ -1591,8 +1691,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_SIZE -
offset);
- size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
+ size_t num_pages;
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1600,8 +1699,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
size_t num_sectors;
int extents_locked;
- WARN_ON(num_pages > nrptrs);
-
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
@@ -1613,35 +1710,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
extent_changeset_release(data_reserved);
ret = btrfs_check_data_free_space(BTRFS_I(inode),
&data_reserved, pos,
write_bytes);
if (ret < 0) {
+ /*
+ * If we don't have to COW at the offset, reserve
+ * metadata only. write_bytes may get smaller than
+ * requested here.
+ */
if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes) > 0) {
- /*
- * For nodata cow case, no need to reserve
- * data space.
- */
+ &write_bytes) > 0)
only_release_metadata = true;
- /*
- * our prealloc extent may be smaller than
- * write_bytes, so scale down.
- */
- num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_SIZE);
- reserve_bytes = round_up(write_bytes +
- sector_offset,
- fs_info->sectorsize);
- } else {
+ else
break;
- }
}
+ num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
+ WARN_ON(num_pages > nrptrs);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes);
@@ -1710,8 +1800,7 @@ again:
if (num_sectors > dirty_sectors) {
/* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors <<
- fs_info->sb->s_blocksize_bits;
+ release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
@@ -1730,10 +1819,9 @@ again:
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- if (copied > 0)
- ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
- dirty_pages, pos, copied,
- &cached_state);
+ ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+ dirty_pages, pos, copied,
+ &cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
@@ -1758,17 +1846,6 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- if (only_release_metadata && copied > 0) {
- lockstart = round_down(pos,
- fs_info->sectorsize);
- lockend = round_up(pos + copied,
- fs_info->sectorsize) - 1;
-
- set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_NORESERVE, NULL,
- NULL, GFP_NOFS);
- }
-
btrfs_drop_pages(pages, num_pages);
cond_resched();
@@ -1795,24 +1872,102 @@ again:
}
extent_changeset_free(data_reserved);
+ if (num_written > 0) {
+ pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
+ iocb->ki_pos += num_written;
+ }
+out:
+ btrfs_inode_unlock(inode, ilock_flags);
return num_written ? num_written : ret;
}
-static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
loff_t pos;
- ssize_t written;
+ ssize_t written = 0;
ssize_t written_buffered;
loff_t endbyte;
- int err;
+ ssize_t err;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio = NULL;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /* If the write DIO is within EOF, use a shared lock */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ err = btrfs_inode_lock(inode, ilock_flags);
+ if (err < 0)
+ return err;
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ return err;
+ }
+
+ err = btrfs_write_check(iocb, from, err);
+ if (err < 0) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock or pos may have changed because of O_APPEND in generic_write_check()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
- written = btrfs_direct_IO(iocb, from);
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(inode, ilock_flags);
+ goto buffered;
+ }
- if (written < 0 || !iov_iter_count(from))
- return written;
+ dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+ btrfs_inode_unlock(inode, ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio)) {
+ err = PTR_ERR_OR_ZERO(dio);
+ if (err < 0 && err != -ENOTBLK)
+ goto out;
+ } else {
+ written = iomap_dio_complete(dio);
+ }
+
+ if (written < 0 || !iov_iter_count(from)) {
+ err = written;
+ goto out;
+ }
+
+buffered:
pos = iocb->ki_pos;
written_buffered = btrfs_buffered_write(iocb, from);
if (written_buffered < 0) {
@@ -1838,24 +1993,6 @@ out:
return written ? written : err;
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- if (!timespec64_equal(&inode->i_mtime, &now))
- inode->i_mtime = now;
-
- if (!timespec64_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -1863,148 +2000,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 start_pos;
- u64 end_pos;
ssize_t num_written = 0;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
- ssize_t err;
- loff_t pos;
- size_t count;
- loff_t oldsize;
- int clean_page = 0;
-
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
-
- err = generic_write_checks(iocb, from);
- if (err <= 0) {
- inode_unlock(inode);
- return err;
- }
-
- pos = iocb->ki_pos;
- count = iov_iter_count(from);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- size_t nocow_bytes = count;
-
- /*
- * We will allocate space in case nodatacow is not set,
- * so bail
- */
- if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
- <= 0) {
- inode_unlock(inode);
- return -EAGAIN;
- }
- /*
- * There are holes in the range or parts of the range that must
- * be COWed (shared extents, RO block groups, etc), so just bail
- * out.
- */
- if (nocow_bytes < count) {
- inode_unlock(inode);
- return -EAGAIN;
- }
- }
-
- current->backing_dev_info = inode_to_bdi(inode);
- err = file_remove_privs(file);
- if (err) {
- inode_unlock(inode);
- goto out;
- }
/*
- * If BTRFS flips readonly due to some impossible error
- * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
- * although we have opened a file as writable, we have
- * to stop this write operation to ensure FS consistency.
+ * If the fs flips readonly due to some impossible error, although we
+ * have opened a file as writable, we have to stop this write operation
+ * to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
- inode_unlock(inode);
- err = -EROFS;
- goto out;
- }
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ return -EROFS;
- /*
- * We reserve space for updating the inode when we reserve space for the
- * extent we are going to write, so we will enospc out there. We don't
- * need to start yet another transaction to update the inode as we will
- * update the inode when we finish writing whatever data we write.
- */
- update_time_for_write(inode);
-
- start_pos = round_down(pos, fs_info->sectorsize);
- oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
- /* Expand hole size to cover write data, preventing empty gap */
- end_pos = round_up(pos + count,
- fs_info->sectorsize);
- err = btrfs_cont_expand(inode, oldsize, end_pos);
- if (err) {
- inode_unlock(inode);
- goto out;
- }
- if (start_pos > round_up(oldsize, fs_info->sectorsize))
- clean_page = 1;
- }
+ if (!(iocb->ki_flags & IOCB_DIRECT) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (iocb->ki_flags & IOCB_DIRECT) {
- /*
- * 1. We must always clear IOCB_DSYNC in order to not deadlock
- * in iomap, as it calls generic_write_sync() in this case.
- * 2. If we are async, we can call iomap_dio_complete() either
- * in
- *
- * 2.1. A worker thread from the last bio completed. In this
- * case we need to mark the btrfs_dio_data that it is
- * async in order to call generic_write_sync() properly.
- * This is handled by setting BTRFS_DIO_SYNC_STUB in the
- * current->journal_info.
- * 2.2 The submitter context, because all IO completed
- * before we exited iomap_dio_rw(). In this case we can
- * just re-set the IOCB_DSYNC on the iocb and we'll do
- * the sync below. If our ->end_io() gets called and
- * current->journal_info is set, then we know we're in
- * our current context and we will clear
- * current->journal_info to indicate that we need to
- * sync below.
- */
- if (sync) {
- ASSERT(current->journal_info == NULL);
- iocb->ki_flags &= ~IOCB_DSYNC;
- current->journal_info = BTRFS_DIO_SYNC_STUB;
- }
- num_written = __btrfs_direct_write(iocb, from);
-
- /*
- * As stated above, we cleared journal_info, so we need to do
- * the sync ourselves.
- */
- if (sync && current->journal_info == NULL)
- iocb->ki_flags |= IOCB_DSYNC;
- current->journal_info = NULL;
- } else {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ num_written = btrfs_direct_write(iocb, from);
+ else
num_written = btrfs_buffered_write(iocb, from);
- if (num_written > 0)
- iocb->ki_pos = pos + num_written;
- if (clean_page)
- pagecache_isize_extended(inode, oldsize,
- i_size_read(inode));
- }
-
- inode_unlock(inode);
/*
* We also have to set last_sub_trans to the current log transid,
@@ -2019,9 +2036,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_dec(&BTRFS_I(inode)->sync_writers);
-out:
+
current->backing_dev_info = NULL;
- return num_written ? num_written : err;
+ return num_written;
}
int btrfs_release_file(struct inode *inode, struct file *filp)
@@ -2116,13 +2133,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
inode_lock(inode);
- /*
- * We take the dio_sem here because the tree log stuff can race with
- * lockless dio writes and get an extent map logged for an extent we
- * never waited on. We need it this high up for lockdep reasons.
- */
- down_write(&BTRFS_I(inode)->dio_sem);
-
atomic_inc(&root->log_batch);
/*
@@ -2153,7 +2163,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2250,7 +2259,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
if (ret != BTRFS_NO_LOG_SYNC) {
@@ -2281,7 +2289,6 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2443,13 +2450,13 @@ out:
* em->start + em->len > start)
* When a hole extent is found, return 1 and modify start/len.
*/
-static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em;
int ret = 0;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+ em = btrfs_get_extent(inode, NULL, 0,
round_down(*start, fs_info->sectorsize),
round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
@@ -2509,13 +2516,14 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
}
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_replace_extent_info *extent_info,
- const u64 replace_len)
+ const u64 replace_len,
+ const u64 bytes_to_drop)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2527,10 +2535,12 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
return 0;
if (extent_info->disk_offset == 0 &&
- btrfs_fs_incompat(fs_info, NO_HOLES))
+ btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- key.objectid = btrfs_ino(BTRFS_I(inode));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -2551,23 +2561,25 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- extent_info->file_offset, replace_len);
+ ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
+ replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (extent_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0) {
+ btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
return 0;
+ }
- inode_add_bytes(inode, replace_len);
+ btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
if (extent_info->is_new_extent && extent_info->insertions == 0) {
key.objectid = extent_info->disk_offset;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = extent_info->disk_len;
ret = btrfs_alloc_reserved_file_extent(trans, root,
- btrfs_ino(BTRFS_I(inode)),
+ btrfs_ino(inode),
extent_info->file_offset,
extent_info->qgroup_reserved,
&key);
@@ -2579,7 +2591,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
+ btrfs_ino(inode), ref_offset);
ret = btrfs_inc_extent_ref(trans, &ref);
}
@@ -2602,6 +2614,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
@@ -2610,7 +2623,6 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
u64 cur_offset;
- u64 drop_end;
u64 len = end - start;
int ret = 0;
@@ -2649,10 +2661,16 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = rsv;
cur_offset = start;
+ drop_args.path = path;
+ drop_args.end = end + 1;
+ drop_args.drop_cache = true;
while (cur_offset < end) {
- ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
- cur_offset, end + 1, &drop_end,
- 1, 0, 0, NULL);
+ drop_args.start = cur_offset;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ /* If we are punching a hole decrement the inode's byte count */
+ if (!extent_info)
+ btrfs_update_inode_bytes(BTRFS_I(inode), 0,
+ drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
* When cloning we want to avoid transaction aborts when
@@ -2669,10 +2687,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!extent_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ cur_offset, drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@@ -2683,7 +2701,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!extent_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2691,7 +2709,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* file extent is inserted here.
*/
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_end - cur_offset);
+ cur_offset,
+ drop_args.drop_end - cur_offset);
if (ret) {
/*
* We couldn't clear our area, so we could
@@ -2703,11 +2722,14 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
}
- if (extent_info && drop_end > extent_info->file_offset) {
- u64 replace_len = drop_end - extent_info->file_offset;
+ if (extent_info &&
+ drop_args.drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_args.drop_end -
+ extent_info->file_offset;
- ret = btrfs_insert_replace_extent(trans, inode, path,
- extent_info, replace_len);
+ ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
+ path, extent_info, replace_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
@@ -2717,9 +2739,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
extent_info->file_offset += replace_len;
}
- cur_offset = drop_end;
+ cur_offset = drop_args.drop_end;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
@@ -2739,7 +2761,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
trans->block_rsv = rsv;
if (!extent_info) {
- ret = find_first_non_hole(inode, &cur_offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
+ &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
@@ -2776,25 +2799,26 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* will not record the existence of the hole region
* [existing_hole_start, lockend].
*/
- if (drop_end <= end)
- drop_end = end + 1;
+ if (drop_args.drop_end <= end)
+ drop_args.drop_end = end + 1;
/*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size &&
+ cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_end);
+ cur_offset, drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!extent_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_end - cur_offset);
+ cur_offset, drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2802,8 +2826,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
if (extent_info) {
- ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
- extent_info->data_len);
+ ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
+ extent_info, extent_info->data_len,
+ drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2849,7 +2874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2874,7 +2899,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
} else {
ret = 0;
}
@@ -2884,7 +2910,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
inode_unlock(inode);
return ret;
@@ -2899,7 +2925,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/* after truncate page, check hole again */
len = offset + len - lockstart;
offset = lockstart;
- ret = find_first_non_hole(inode, &offset, &len);
+ ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
goto out_only_mutex;
if (ret && !len) {
@@ -2913,14 +2939,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
tail_start = lockend + 1;
tail_len = offset + len - tail_start;
if (tail_len) {
- ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
if (unlikely(ret < 0))
goto out_only_mutex;
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(inode,
+ ret = btrfs_truncate_block(BTRFS_I(inode),
tail_start + tail_len,
0, 1);
if (ret)
@@ -2954,7 +2980,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -2981,7 +3007,7 @@ out_only_mutex:
} else {
int ret2;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
if (!ret)
ret = ret2;
@@ -3049,8 +3075,8 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret2 = btrfs_end_transaction(trans);
return ret ? ret : ret2;
@@ -3162,7 +3188,8 @@ static int btrfs_zero_range(struct inode *inode,
}
if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
free_extent_map(em);
- ret = btrfs_truncate_block(inode, offset, len, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
+ 0);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
@@ -3193,7 +3220,7 @@ static int btrfs_zero_range(struct inode *inode,
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret)
goto out;
} else {
@@ -3210,7 +3237,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
+ 0, 1);
if (ret)
goto out;
} else {
@@ -3280,6 +3308,10 @@ static long btrfs_fallocate(struct file *file, int mode,
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
+ /* Do not allow fallocate in ZONED mode */
+ if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
+ return -EOPNOTSUPP;
+
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
cur_offset = alloc_start;
@@ -3304,7 +3336,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3320,7 +3352,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* But that's a minor problem and won't do much harm BTW.
*/
if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, i_size_read(inode),
+ ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
alloc_start);
if (ret)
goto out;
@@ -3330,7 +3362,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
}
@@ -3543,9 +3575,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- inode_lock_shared(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
offset = find_desired_extent(inode, offset, whence);
- inode_unlock_shared(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
@@ -3561,16 +3593,47 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++)
+ for (i = seg + 1; i < iter->nr_segs; i++)
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ is_sync_kiocb(iocb));
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
if (iocb->ki_flags & IOCB_DIRECT) {
- struct inode *inode = file_inode(iocb->ki_filp);
-
- inode_lock_shared(inode);
- ret = btrfs_direct_IO(iocb, to);
- inode_unlock_shared(inode);
+ ret = btrfs_direct_read(iocb, to);
if (ret < 0 || !iov_iter_count(to) ||
iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af0013d3df63..4d8897879c9c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,7 +16,6 @@
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
-#include "inode-map.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
@@ -33,16 +32,18 @@ struct btrfs_trim_range {
struct list_head list;
};
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info);
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path);
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes, bool for_alloc);
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes);
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
@@ -141,17 +142,15 @@ static int __create_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
+ /* We inline CRCs for the free disk space cache */
+ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
+ BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
int ret;
ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
- /* We inline crc's for the free disk space cache */
- if (ino != BTRFS_FREE_INO_OBJECTID)
- flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
-
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -207,6 +206,65 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
ino, block_group->start);
}
+/*
+ * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
+ * handles lookup, otherwise it takes ownership and iputs the inode.
+ * Don't reuse an inode pointer after passing it into this function.
+ */
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ if (!inode)
+ inode = lookup_free_space_inode(block_group, path);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) != -ENOENT)
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_add_delayed_iput(inode);
+ goto out;
+ }
+ clear_nlink(inode);
+ /* One for the block groups ref */
+ spin_lock(&block_group->lock);
+ if (block_group->iref) {
+ block_group->iref = 0;
+ block_group->inode = NULL;
+ spin_unlock(&block_group->lock);
+ iput(inode);
+ } else {
+ spin_unlock(&block_group->lock);
+ }
+ /* One for the lookup ref */
+ btrfs_add_delayed_iput(inode);
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.type = 0;
+ key.offset = block_group->start;
+ ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
+ -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
@@ -267,12 +325,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
* We skip the throttling logic for free space cache inodes, so we don't
* need to check for -EAGAIN.
*/
- ret = btrfs_truncate_inode_items(trans, root, inode,
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
0, BTRFS_EXTENT_DATA_KEY);
if (ret)
goto fail;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
fail:
if (locked)
@@ -304,16 +362,11 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int write)
{
int num_pages;
- int check_crcs = 0;
num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
- check_crcs = 1;
-
/* Make sure we can fit our crcs and generation into the first page */
- if (write && check_crcs &&
- (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
+ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -324,7 +377,6 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
io_ctl->num_pages = num_pages;
io_ctl->fs_info = btrfs_sb(inode->i_sb);
- io_ctl->check_crcs = check_crcs;
io_ctl->inode = inode;
return 0;
@@ -419,13 +471,8 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
* Skip the csum areas. If we don't check crcs then we just have a
* 64bit chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
- io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
@@ -439,14 +486,8 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
* Skip the crc area. If we don't check crcs then we just have a 64bit
* chunk at the front of the first page.
*/
- if (io_ctl->check_crcs) {
- io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
- io_ctl->size -= sizeof(u64) +
- (sizeof(u32) * io_ctl->num_pages);
- } else {
- io_ctl->cur += sizeof(u64);
- io_ctl->size -= sizeof(u64) * 2;
- }
+ io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+ io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
cache_gen = get_unaligned_le64(io_ctl->cur);
if (cache_gen != generation) {
@@ -466,11 +507,6 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_unmap_page(io_ctl);
- return;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -488,11 +524,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
u32 crc = ~(u32)0;
unsigned offset = 0;
- if (!io_ctl->check_crcs) {
- io_ctl_map_page(io_ctl, 0);
- return 0;
- }
-
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
@@ -625,42 +656,42 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
return 0;
}
-/*
- * Since we attach pinned extents after the fact we can have contiguous sections
- * of free space that are split up in entries. This poses a problem with the
- * tree logging stuff since it could have allocated across what appears to be 2
- * entries since we would have merged the entries when adding the pinned extents
- * back to the free space cache. So run through the space cache that we just
- * loaded and merge contiguous entries. This will make the log replay stuff not
- * blow up and it will make for nicer allocator behavior.
- */
-static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_free_space *e, *prev = NULL;
- struct rb_node *n;
+ struct btrfs_block_group *block_group = ctl->private;
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+ u64 size = block_group->length;
+ u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-again:
- spin_lock(&ctl->tree_lock);
- for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
- e = rb_entry(n, struct btrfs_free_space, offset_index);
- if (!prev)
- goto next;
- if (e->bitmap || prev->bitmap)
- goto next;
- if (prev->offset + prev->bytes == e->offset) {
- unlink_free_space(ctl, prev);
- unlink_free_space(ctl, e);
- prev->bytes += e->bytes;
- kmem_cache_free(btrfs_free_space_cachep, e);
- link_free_space(ctl, prev);
- prev = NULL;
- spin_unlock(&ctl->tree_lock);
- goto again;
- }
-next:
- prev = e;
- }
- spin_unlock(&ctl->tree_lock);
+ max_bitmaps = max_t(u64, max_bitmaps, 1);
+
+ ASSERT(ctl->total_bitmaps <= max_bitmaps);
+
+ /*
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
+ */
+ if (size < SZ_1G)
+ max_bytes = MAX_CACHE_BYTES_PER_GIG;
+ else
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
+
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the max
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
+
+ ctl->extents_thresh =
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
@@ -753,16 +784,6 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
goto free_cache;
}
- /*
- * Sync discard ensures that the free space cache is always
- * trimmed. So when reading this in, the state should reflect
- * that. We also do this for async as a stop gap for lack of
- * persistence.
- */
- if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC))
- e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
-
if (!e->bytes) {
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
@@ -791,7 +812,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
btrfs_err(fs_info,
@@ -816,19 +837,11 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
- e->bitmap_extents = count_bitmap_extents(ctl, e);
- if (!btrfs_free_space_trimmed(e)) {
- ctl->discardable_extents[BTRFS_STAT_CURR] +=
- e->bitmap_extents;
- ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
- }
}
io_ctl_drop_pages(&io_ctl);
- merge_space_tree(ctl);
ret = 1;
out:
- btrfs_discard_update_discardable(ctl->private, ctl);
io_ctl_free(&io_ctl);
return ret;
free_cache:
@@ -837,10 +850,46 @@ free_cache:
goto out;
}
+static int copy_free_space_cache(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int ret = 0;
+
+ while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (!info->bitmap) {
+ unlink_free_space(ctl, info);
+ ret = btrfs_add_free_space(block_group, info->offset,
+ info->bytes);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ } else {
+ u64 offset = info->offset;
+ u64 bytes = ctl->unit;
+
+ while (search_bitmap(ctl, info, &offset, &bytes,
+ false) == 0) {
+ ret = btrfs_add_free_space(block_group, offset,
+ bytes);
+ if (ret)
+ break;
+ bitmap_clear_bits(ctl, info, offset, bytes);
+ offset = info->offset;
+ bytes = ctl->unit;
+ }
+ free_bitmap(ctl, info);
+ }
+ cond_resched();
+ }
+ return ret;
+}
+
int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space_ctl tmp_ctl = {};
struct inode *inode;
struct btrfs_path *path;
int ret = 0;
@@ -848,6 +897,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
u64 used = block_group->used;
/*
+ * Because we could potentially discard our loaded free space, we want
+ * to load everything into a temporary structure first, and then if it's
+ * valid copy it all into the actual free space ctl.
+ */
+ btrfs_init_free_space_ctl(block_group, &tmp_ctl);
+
+ /*
* If this block group has been marked to be cleared for one reason or
* another then we can't trust the on disk cache, so just return.
*/
@@ -898,19 +954,25 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
}
spin_unlock(&block_group->lock);
- ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
- spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->length - used -
- block_group->bytes_super));
- spin_unlock(&ctl->tree_lock);
+ matched = (tmp_ctl.free_space == (block_group->length - used -
+ block_group->bytes_super));
- if (!matched) {
- __btrfs_remove_free_space_cache(ctl);
+ if (matched) {
+ ret = copy_free_space_cache(block_group, &tmp_ctl);
+ /*
+ * ret == 1 means we successfully loaded the free space cache,
+ * so we need to re-set it here.
+ */
+ if (ret == 0)
+ ret = 1;
+ } else {
+ __btrfs_remove_free_space_cache(&tmp_ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
block_group->start);
@@ -929,6 +991,9 @@ out:
block_group->start);
}
+ spin_lock(&ctl->tree_lock);
+ btrfs_discard_update_discardable(block_group);
+ spin_unlock(&ctl->tree_lock);
iput(inode);
return ret;
}
@@ -1191,7 +1256,7 @@ out:
"failed to write free space cache for block group %llu error %d",
block_group->start, ret);
}
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
if (block_group) {
/* the dirty list is protected by the dirty_bgs_lock */
@@ -1220,14 +1285,6 @@ out:
}
-static int btrfs_wait_cache_io_root(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_io_ctl *io_ctl,
- struct btrfs_path *path)
-{
- return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
-}
-
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
@@ -1332,7 +1389,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/* Everything is written out, now we dirty the pages in the file. */
ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
io_ctl->num_pages, 0, i_size_read(inode),
- &cached_state);
+ &cached_state, false);
if (ret)
goto out_nospc;
@@ -1381,7 +1438,7 @@ out:
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
if (must_iput)
iput(inode);
return ret;
@@ -1672,44 +1729,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
return ret;
}
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_block_group *block_group = ctl->private;
- u64 max_bytes;
- u64 bitmap_bytes;
- u64 extent_bytes;
- u64 size = block_group->length;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
-
- max_bitmaps = max_t(u64, max_bitmaps, 1);
-
- ASSERT(ctl->total_bitmaps <= max_bitmaps);
-
- /*
- * We are trying to keep the total amount of memory used per 1GiB of
- * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
- * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
- * bitmaps, we may end up using more memory than this.
- */
- if (size < SZ_1G)
- max_bytes = MAX_CACHE_BYTES_PER_GIG;
- else
- max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
-
- bitmap_bytes = ctl->total_bitmaps * ctl->unit;
-
- /*
- * we want the extent entry threshold to always be at most 1/2 the max
- * bytes we can have, or whatever is less than that.
- */
- extent_bytes = max_bytes - bitmap_bytes;
- extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
-
- ctl->extents_thresh =
- div_u64(extent_bytes, sizeof(struct btrfs_free_space));
-}
-
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
u64 offset, u64 bytes)
@@ -1912,29 +1931,6 @@ out:
return NULL;
}
-static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *bitmap_info)
-{
- struct btrfs_block_group *block_group = ctl->private;
- u64 bytes = bitmap_info->bytes;
- unsigned int rs, re;
- int count = 0;
-
- if (!block_group || !bytes)
- return count;
-
- bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
- BITS_PER_BITMAP) {
- bytes -= (rs - re) * ctl->unit;
- count++;
-
- if (!bytes)
- break;
- }
-
- return count;
-}
-
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
@@ -1944,8 +1940,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
-
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -1967,7 +1962,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
}
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -2134,7 +2129,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
}
static const struct btrfs_free_space_op free_space_op = {
- .recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2508,7 +2502,7 @@ link:
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (ret) {
@@ -2643,7 +2637,7 @@ again:
goto again;
}
out_lock:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
out:
return ret;
@@ -2674,10 +2668,10 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
@@ -2779,7 +2773,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
if (ctl->private)
- btrfs_discard_update_discardable(ctl->private, ctl);
+ btrfs_discard_update_discardable(ctl->private);
spin_unlock(&ctl->tree_lock);
}
@@ -2801,7 +2795,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
}
@@ -2885,7 +2879,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
link_free_space(ctl, entry);
}
out:
- btrfs_discard_update_discardable(block_group, ctl);
+ btrfs_discard_update_discardable(block_group);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
@@ -3054,7 +3048,7 @@ out:
kmem_cache_free(btrfs_free_space_bitmap_cachep,
entry->bitmap);
ctl->total_bitmaps--;
- ctl->op->recalc_thresholds(ctl);
+ recalculate_thresholds(ctl);
} else if (!btrfs_free_space_trimmed(entry)) {
ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
@@ -3828,166 +3822,62 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
return ret;
}
-/*
- * Find the left-most item in the cache tree, and then return the
- * smallest inode number in the item.
- *
- * Note: the returned inode number may not be the smallest one in
- * the tree, if the left-most item is a bitmap.
- */
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
- struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
- struct btrfs_free_space *entry = NULL;
- u64 ino = 0;
-
- spin_lock(&ctl->tree_lock);
-
- if (RB_EMPTY_ROOT(&ctl->free_space_offset))
- goto out;
-
- entry = rb_entry(rb_first(&ctl->free_space_offset),
- struct btrfs_free_space, offset_index);
-
- if (!entry->bitmap) {
- ino = entry->offset;
-
- unlink_free_space(ctl, entry);
- entry->offset++;
- entry->bytes--;
- if (!entry->bytes)
- kmem_cache_free(btrfs_free_space_cachep, entry);
- else
- link_free_space(ctl, entry);
- } else {
- u64 offset = 0;
- u64 count = 1;
- int ret;
-
- ret = search_bitmap(ctl, entry, &offset, &count, true);
- /* Logic error; Should be empty if it can't find anything */
- ASSERT(!ret);
-
- ino = offset;
- bitmap_clear_bits(ctl, entry, offset, 1);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- }
-out:
- spin_unlock(&ctl->tree_lock);
-
- return ino;
+ return btrfs_super_cache_generation(fs_info->super_copy);
}
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path)
+static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
{
- struct inode *inode = NULL;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_inode)
- inode = igrab(root->ino_cache_inode);
- spin_unlock(&root->ino_cache_lock);
- if (inode)
- return inode;
-
- inode = __lookup_free_space_inode(root, path, 0);
- if (IS_ERR(inode))
- return inode;
-
- spin_lock(&root->ino_cache_lock);
- if (!btrfs_fs_closing(root->fs_info))
- root->ino_cache_inode = igrab(inode);
- spin_unlock(&root->ino_cache_lock);
-
- return inode;
-}
-
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path)
-{
- return __create_free_space_inode(root, trans, path,
- BTRFS_FREE_INO_OBJECTID, 0);
-}
-
-int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
- int ret = 0;
- u64 root_gen = btrfs_root_generation(&root->root_item);
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- /*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
- */
- if (btrfs_fs_closing(fs_info))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return 0;
-
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode))
- goto out;
-
- if (root_gen != BTRFS_I(inode)->generation)
- goto out_put;
+ struct btrfs_block_group *block_group;
+ struct rb_node *node;
+ int ret;
- ret = __load_free_space_cache(root, inode, ctl, path, 0);
+ btrfs_info(fs_info, "cleaning free space cache v1");
- if (ret < 0)
- btrfs_err(fs_info,
- "failed to load free ino cache for root %llu",
- root->root_key.objectid);
-out_put:
- iput(inode);
+ node = rb_first(&fs_info->block_group_cache_tree);
+ while (node) {
+ block_group = rb_entry(node, struct btrfs_block_group, cache_node);
+ ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
+ if (ret)
+ goto out;
+ node = rb_next(node);
+ }
out:
- btrfs_free_path(path);
return ret;
}
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode)
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_trans_handle *trans;
int ret;
- struct btrfs_io_ctl io_ctl;
- bool release_metadata = true;
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
+ /*
+ * update_super_roots will appropriately set or unset
+ * super_copy->cache_generation based on SPACE_CACHE and
+ * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
+ * transaction commit whether we are enabling space cache v1 and don't
+ * have any other work to do, or are disabling it and removing free
+ * space inodes.
+ */
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- memset(&io_ctl, 0, sizeof(io_ctl));
- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
- if (!ret) {
- /*
- * At this point writepages() didn't error out, so our metadata
- * reservation is released when the writeback finishes, at
- * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
- * with or without an error.
- */
- release_metadata = false;
- ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
+ if (!active) {
+ set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
+ ret = cleanup_free_space_cache_v1(fs_info, trans);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
}
- if (ret) {
- if (release_metadata)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- inode->i_size, true);
- btrfs_debug(fs_info,
- "failed to write free ino cache for root %llu error %d",
- root->root_key.objectid, ret);
- }
+ ret = btrfs_commit_transaction(trans);
+out:
+ clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e3d5e0ad8f8e..ecb09a02d544 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -60,7 +60,6 @@ struct btrfs_free_space_ctl {
};
struct btrfs_free_space_op {
- void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
};
@@ -76,7 +75,6 @@ struct btrfs_io_ctl {
int num_pages;
int entries;
int bitmaps;
- unsigned check_crcs:1;
};
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -84,6 +82,9 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
int create_free_space_inode(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
+int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_block_group *block_group);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
@@ -97,19 +98,9 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
-struct inode *lookup_free_ino_inode(struct btrfs_root *root,
- struct btrfs_path *path);
-int create_free_ino_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path);
-int load_free_ino_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_root *root);
-int btrfs_write_out_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
- struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 bytenr, u64 size,
@@ -126,7 +117,6 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
-u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
@@ -148,6 +138,8 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
+bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
+int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group *cache,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 6b9faf3b0e96..e33a65bd9a0c 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -136,9 +136,10 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
return 0;
}
-static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
+ u64 size)
{
- return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+ return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE);
}
static unsigned long *alloc_bitmap(u32 bitmap_size)
@@ -200,8 +201,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -290,8 +290,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
u32 data_size;
extent_size = min(end - i, bitmap_range);
- data_size = free_space_bitmap_size(extent_size,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info, extent_size);
key.objectid = i;
key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
@@ -339,8 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->length,
- fs_info->sectorsize);
+ bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
@@ -383,8 +381,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
fs_info->sectorsize *
BITS_PER_BYTE);
bitmap_cursor = ((char *)bitmap) + bitmap_pos;
- data_size = free_space_bitmap_size(found_key.offset,
- fs_info->sectorsize);
+ data_size = free_space_bitmap_size(fs_info,
+ found_key.offset);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
read_extent_buffer(leaf, bitmap_cursor, ptr,
@@ -416,7 +414,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
+ nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -540,8 +538,8 @@ static void free_space_set_bits(struct btrfs_block_group *block_group,
end = found_end;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- first = div_u64(*start - found_start, fs_info->sectorsize);
- last = div_u64(end - found_start, fs_info->sectorsize);
+ first = (*start - found_start) >> fs_info->sectorsize_bits;
+ last = (end - found_start) >> fs_info->sectorsize_bits;
if (bit)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
@@ -1195,8 +1193,6 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.type = 0;
key.offset = 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 668701832845..37f36ffdaf6b 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -119,8 +119,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -ENOENT;
@@ -193,8 +191,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -270,7 +266,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -327,7 +322,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
deleted file mode 100644
index 76d2e43817ea..000000000000
--- a/fs/btrfs/inode-map.c
+++ /dev/null
@@ -1,582 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- */
-
-#include <linux/kthread.h>
-#include <linux/pagemap.h>
-
-#include "ctree.h"
-#include "disk-io.h"
-#include "free-space-cache.h"
-#include "inode-map.h"
-#include "transaction.h"
-#include "delalloc-space.h"
-
-static void fail_caching_thread(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
- btrfs_warn(fs_info, "failed to start inode caching task");
- btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
- "disabling inode map caching");
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_ERROR;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
-}
-
-static int caching_kthread(void *data)
-{
- struct btrfs_root *root = data;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_key key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- u64 last = (u64)-1;
- int slot;
- int ret;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path) {
- fail_caching_thread(root);
- return -ENOMEM;
- }
-
- /* Since the commit root is read-only, we can safely skip locking. */
- path->skip_locking = 1;
- path->search_commit_root = 1;
- path->reada = READA_FORWARD;
-
- key.objectid = BTRFS_FIRST_FREE_OBJECTID;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
-again:
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- while (1) {
- if (btrfs_fs_closing(fs_info))
- goto out;
-
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- else if (ret > 0)
- break;
-
- if (need_resched() ||
- btrfs_transaction_in_commit(fs_info)) {
- leaf = path->nodes[0];
-
- if (WARN_ON(btrfs_header_nritems(leaf) == 0))
- break;
-
- /*
- * Save the key so we can advances forward
- * in the next search.
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(path);
- root->ino_cache_progress = last;
- up_read(&fs_info->commit_root_sem);
- schedule_timeout(1);
- goto again;
- } else
- continue;
- }
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- if (key.type != BTRFS_INODE_ITEM_KEY)
- goto next;
-
- if (key.objectid >= root->highest_objectid)
- break;
-
- if (last != (u64)-1 && last + 1 != key.objectid) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- key.objectid - last - 1, 0);
- wake_up(&root->ino_cache_wait);
- }
-
- last = key.objectid;
-next:
- path->slots[0]++;
- }
-
- if (last < root->highest_objectid - 1) {
- __btrfs_add_free_space(fs_info, ctl, last + 1,
- root->highest_objectid - last - 1, 0);
- }
-
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
-
- root->ino_cache_progress = (u64)-1;
- btrfs_unpin_free_ino(root);
-out:
- wake_up(&root->ino_cache_wait);
- up_read(&fs_info->commit_root_sem);
-
- btrfs_free_path(path);
-
- return ret;
-}
-
-static void start_caching(struct btrfs_root *root)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct task_struct *tsk;
- int ret;
- u64 objectid;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_NO) {
- spin_unlock(&root->ino_cache_lock);
- return;
- }
-
- root->ino_cache_state = BTRFS_CACHE_STARTED;
- spin_unlock(&root->ino_cache_lock);
-
- ret = load_free_ino_cache(fs_info, root);
- if (ret == 1) {
- spin_lock(&root->ino_cache_lock);
- root->ino_cache_state = BTRFS_CACHE_FINISHED;
- spin_unlock(&root->ino_cache_lock);
- wake_up(&root->ino_cache_wait);
- return;
- }
-
- /*
- * It can be quite time-consuming to fill the cache by searching
- * through the extent tree, and this can keep ino allocation path
- * waiting. Therefore at start we quickly find out the highest
- * inode number and we know we can use inode numbers which fall in
- * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
- */
- ret = btrfs_find_free_objectid(root, &objectid);
- if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
- __btrfs_add_free_space(fs_info, ctl, objectid,
- BTRFS_LAST_FREE_OBJECTID - objectid + 1,
- 0);
- wake_up(&root->ino_cache_wait);
- }
-
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
- root->root_key.objectid);
- if (IS_ERR(tsk))
- fail_caching_thread(root);
-}
-
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
-{
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return btrfs_find_free_objectid(root, objectid);
-
-again:
- *objectid = btrfs_find_ino_for_alloc(root);
-
- if (*objectid != 0)
- return 0;
-
- start_caching(root);
-
- wait_event(root->ino_cache_wait,
- root->ino_cache_state == BTRFS_CACHE_FINISHED ||
- root->ino_cache_state == BTRFS_CACHE_ERROR ||
- root->free_ino_ctl->free_space > 0);
-
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
- root->free_ino_ctl->free_space == 0)
- return -ENOSPC;
- else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
- return btrfs_find_free_objectid(root, objectid);
- else
- goto again;
-}
-
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return;
-again:
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
- } else {
- down_write(&fs_info->commit_root_sem);
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- spin_unlock(&root->ino_cache_lock);
- up_write(&fs_info->commit_root_sem);
- goto again;
- }
- spin_unlock(&root->ino_cache_lock);
-
- start_caching(root);
-
- __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
-
- up_write(&fs_info->commit_root_sem);
- }
-}
-
-/*
- * When a transaction is committed, we'll move those inode numbers which are
- * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
- * others will just be dropped, because the commit root we were searching has
- * changed.
- *
- * Must be called with root->fs_info->commit_root_sem held
- */
-void btrfs_unpin_free_ino(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
- spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
- struct btrfs_free_space *info;
- struct rb_node *n;
- u64 count;
-
- if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
- return;
-
- while (1) {
- spin_lock(rbroot_lock);
- n = rb_first(rbroot);
- if (!n) {
- spin_unlock(rbroot_lock);
- break;
- }
-
- info = rb_entry(n, struct btrfs_free_space, offset_index);
- BUG_ON(info->bitmap); /* Logic error */
-
- if (info->offset > root->ino_cache_progress)
- count = 0;
- else
- count = min(root->ino_cache_progress - info->offset + 1,
- info->bytes);
-
- rb_erase(&info->offset_index, rbroot);
- spin_unlock(rbroot_lock);
- if (count)
- __btrfs_add_free_space(root->fs_info, ctl,
- info->offset, count, 0);
- kmem_cache_free(btrfs_free_space_cachep, info);
- }
-}
-
-#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_SIZE * 8)
-
-/*
- * The goal is to keep the memory used by the free_ino tree won't
- * exceed the memory if we use bitmaps only.
- */
-static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
-{
- struct btrfs_free_space *info;
- struct rb_node *n;
- int max_ino;
- int max_bitmaps;
-
- n = rb_last(&ctl->free_space_offset);
- if (!n) {
- ctl->extents_thresh = INIT_THRESHOLD;
- return;
- }
- info = rb_entry(n, struct btrfs_free_space, offset_index);
-
- /*
- * Find the maximum inode number in the filesystem. Note we
- * ignore the fact that this can be a bitmap, because we are
- * not doing precise calculation.
- */
- max_ino = info->bytes - 1;
-
- max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
- if (max_bitmaps <= ctl->total_bitmaps) {
- ctl->extents_thresh = 0;
- return;
- }
-
- ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
- PAGE_SIZE / sizeof(*info);
-}
-
-/*
- * We don't fall back to bitmap, if we are below the extents threshold
- * or this chunk of inode numbers is a big one.
- */
-static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- if (ctl->free_extents < ctl->extents_thresh ||
- info->bytes > INODES_PER_BITMAP / 10)
- return false;
-
- return true;
-}
-
-static const struct btrfs_free_space_op free_ino_op = {
- .recalc_thresholds = recalculate_thresholds,
- .use_bitmap = use_bitmap,
-};
-
-static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
-{
-}
-
-static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
- struct btrfs_free_space *info)
-{
- /*
- * We always use extents for two reasons:
- *
- * - The pinned tree is only used during the process of caching
- * work.
- * - Make code simpler. See btrfs_unpin_free_ino().
- */
- return false;
-}
-
-static const struct btrfs_free_space_op pinned_free_ino_op = {
- .recalc_thresholds = pinned_recalc_thresholds,
- .use_bitmap = pinned_use_bitmap,
-};
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root)
-{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
-
- spin_lock_init(&ctl->tree_lock);
- ctl->unit = 1;
- ctl->start = 0;
- ctl->private = NULL;
- ctl->op = &free_ino_op;
- INIT_LIST_HEAD(&ctl->trimming_ranges);
- mutex_init(&ctl->cache_writeout_mutex);
-
- /*
- * Initially we allow to use 16K of ram to cache chunks of
- * inode numbers before we resort to bitmaps. This is somewhat
- * arbitrary, but it will be adjusted in runtime.
- */
- ctl->extents_thresh = INIT_THRESHOLD;
-
- spin_lock_init(&pinned->tree_lock);
- pinned->unit = 1;
- pinned->start = 0;
- pinned->private = NULL;
- pinned->extents_thresh = 0;
- pinned->op = &pinned_free_ino_op;
-}
-
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct btrfs_path *path;
- struct inode *inode;
- struct btrfs_block_rsv *rsv;
- struct extent_changeset *data_reserved = NULL;
- u64 num_bytes;
- u64 alloc_hint = 0;
- int ret;
- int prealloc;
- bool retry = false;
-
- /* only fs tree and subvol/snap needs ino cache */
- if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
- (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
- root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
- return 0;
-
- /* Don't save inode cache if we are deleting this root */
- if (btrfs_root_refs(&root->root_item) == 0)
- return 0;
-
- if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
- return 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- rsv = trans->block_rsv;
- trans->block_rsv = &fs_info->trans_block_rsv;
-
- num_bytes = trans->bytes_reserved;
- /*
- * 1 item for inode item insertion if need
- * 4 items for inode item update (in the worst case)
- * 1 items for slack space if we need do truncation
- * 1 item for free space object
- * 3 items for pre-allocation
- */
- trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
- ret = btrfs_block_rsv_add(root, trans->block_rsv,
- trans->bytes_reserved,
- BTRFS_RESERVE_NO_FLUSH);
- if (ret)
- goto out;
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 1);
-again:
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
- ret = PTR_ERR(inode);
- goto out_release;
- }
-
- if (IS_ERR(inode)) {
- BUG_ON(retry); /* Logic error */
- retry = true;
-
- ret = create_free_ino_inode(root, trans, path);
- if (ret)
- goto out_release;
- goto again;
- }
-
- BTRFS_I(inode)->generation = 0;
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
-
- if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
- if (ret) {
- if (ret != -ENOSPC)
- btrfs_abort_transaction(trans, ret);
- goto out_put;
- }
- }
-
- spin_lock(&root->ino_cache_lock);
- if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
- ret = -1;
- spin_unlock(&root->ino_cache_lock);
- goto out_put;
- }
- spin_unlock(&root->ino_cache_lock);
-
- spin_lock(&ctl->tree_lock);
- prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
- prealloc = ALIGN(prealloc, PAGE_SIZE);
- prealloc += ctl->total_bitmaps * PAGE_SIZE;
- spin_unlock(&ctl->tree_lock);
-
- /* Just to make sure we have enough space */
- prealloc += 8 * PAGE_SIZE;
-
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0,
- prealloc);
- if (ret)
- goto out_put;
-
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
- prealloc, prealloc, &alloc_hint);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
- goto out_put;
- }
-
- ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
-out_put:
- iput(inode);
-out_release:
- trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
- trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved, NULL);
-out:
- trans->block_rsv = rsv;
- trans->bytes_reserved = num_bytes;
-
- btrfs_free_path(path);
- extent_changeset_free(data_reserved);
- return ret;
-}
-
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
-{
- struct btrfs_path *path;
- int ret;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- struct btrfs_key found_key;
- int slot;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
- search_key.type = -1;
- search_key.offset = (u64)-1;
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- BUG_ON(ret == 0); /* Corruption */
- if (path->slots[0] > 0) {
- slot = path->slots[0] - 1;
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = max_t(u64, found_key.objectid,
- BTRFS_FIRST_FREE_OBJECTID - 1);
- } else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
- }
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
-}
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
-{
- int ret;
- mutex_lock(&root->objectid_mutex);
-
- if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
- btrfs_warn(root->fs_info,
- "the objectid of root %llu reaches its highest value",
- root->root_key.objectid);
- ret = -ENOSPC;
- goto out;
- }
-
- *objectid = ++root->highest_objectid;
- ret = 0;
-out:
- mutex_unlock(&root->objectid_mutex);
- return ret;
-}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
deleted file mode 100644
index 7a962811dffe..000000000000
--- a/fs/btrfs/inode-map.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef BTRFS_INODE_MAP_H
-#define BTRFS_INODE_MAP_H
-
-void btrfs_init_free_ino_ctl(struct btrfs_root *root);
-void btrfs_unpin_free_ino(struct btrfs_root *root);
-void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
-int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
-int btrfs_save_ino_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans);
-
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
-
-#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e8d8169779d..8e23780acfae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,7 +45,6 @@
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -62,7 +61,6 @@ struct btrfs_dio_data {
loff_t length;
ssize_t submitted;
struct extent_changeset *data_reserved;
- bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -96,6 +94,51 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const bool uptodate);
/*
+ * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
+ *
+ * ilock_flags can have the following bit set:
+ *
+ * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
+ * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
+ * return -EAGAIN
+ */
+int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED) {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock_shared(inode);
+ } else {
+ if (ilock_flags & BTRFS_ILOCK_TRY) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ else
+ return 0;
+ }
+ inode_lock(inode);
+ }
+ return 0;
+}
+
+/*
+ * btrfs_inode_unlock - unock inode i_rwsem
+ *
+ * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
+ * to decide whether the lock acquired is shared or exclusive.
+ */
+void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
+{
+ if (ilock_flags & BTRFS_ILOCK_SHARED)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+}
+
+/*
* Cleanup all submitted ordered extents in specified range to handle errors
* from the btrfs_run_delalloc_range() callback.
*
@@ -158,7 +201,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, int extent_inserted,
+ struct btrfs_path *path, bool extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
int compress_type,
@@ -179,8 +222,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
if (compressed_size && compressed_pages)
cur_size = compressed_size;
- inode_add_bytes(inode, size);
-
if (!extent_inserted) {
struct btrfs_key key;
size_t datasize;
@@ -190,7 +231,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
datasize);
if (ret)
@@ -256,8 +296,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* could end up racing with unlink.
*/
BTRFS_I(inode)->disk_i_size = inode->i_size;
- ret = btrfs_update_inode(trans, root, inode);
-
fail:
return ret;
}
@@ -273,6 +311,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
int compress_type,
struct page **compressed_pages)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
@@ -283,8 +322,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
u64 data_len = inline_len;
int ret;
struct btrfs_path *path;
- int extent_inserted = 0;
- u32 extent_item_size;
if (compressed_size)
data_len = compressed_size;
@@ -310,16 +347,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
}
trans->block_rsv = &inode->block_rsv;
+ drop_args.path = path;
+ drop_args.start = start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ drop_args.replace_extent = true;
+
if (compressed_size && compressed_pages)
- extent_item_size = btrfs_file_extent_calc_inline_size(
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
compressed_size);
else
- extent_item_size = btrfs_file_extent_calc_inline_size(
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
inline_len);
- ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
- NULL, 1, 1, extent_item_size,
- &extent_inserted);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -327,7 +368,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, extent_inserted,
+ ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
root, &inode->vfs_inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
@@ -339,8 +380,17 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
+ btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret != -ENOSPC) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ } else if (ret == -ENOSPC) {
+ ret = 1;
+ goto out;
+ }
+
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
- btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -1598,6 +1648,15 @@ next_slot:
goto out_check;
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
goto out_check;
+
+ /*
+ * The following checks can be expensive, as they need to
+ * take other locks and do btree or rbtree searches, so
+ * release the path to avoid blocking other tasks for too
+ * long.
+ */
+ btrfs_release_path(path);
+
/* If extent is RO, we must COW it */
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
@@ -1673,12 +1732,12 @@ out_check:
cur_offset = extent_end;
if (cur_offset > end)
break;
+ if (!path->nodes[0])
+ continue;
path->slots[0]++;
goto next_slot;
}
- btrfs_release_path(path);
-
/*
* COW range from cow_start to found_key.offset - 1. As the key
* will contain the beginning of the first extent that can be
@@ -2098,6 +2157,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
spin_lock(&inode->lock);
ASSERT(inode->new_delalloc_bytes >= len);
inode->new_delalloc_bytes -= len;
+ if (*bits & EXTENT_ADD_INODE_BYTES)
+ inode_add_bytes(&inode->vfs_inode, len);
spin_unlock(&inode->lock);
}
}
@@ -2121,7 +2182,7 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
{
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -2150,11 +2211,9 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
* At IO completion time the cums attached on the ordered extent record
* are inserted into the btree
*/
-static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+ u64 dio_file_offset)
{
- struct inode *inode = private_data;
-
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
@@ -2187,7 +2246,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int skip_sum;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
- skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+ !fs_info->csum_root;
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -2202,8 +2262,13 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
mirror_num,
bio_flags);
goto out;
- } else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
+ } else {
+ /*
+ * Lookup bio sums does extra checks around whether we
+ * need to csum or not, which is why we ignore skip_sum
+ * here.
+ */
+ ret = btrfs_lookup_bio_sums(inode, bio, NULL);
if (ret)
goto out;
}
@@ -2213,8 +2278,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
goto mapit;
/* we're doing a write, do the async checksumming */
- ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
- 0, inode, btrfs_submit_bio_start);
+ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
+ 0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
@@ -2282,8 +2347,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
ret = set_extent_bit(&inode->io_tree, search_start,
search_start + em_len - 1,
- EXTENT_DELALLOC_NEW,
- NULL, cached_state, GFP_NOFS);
+ EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
+ GFP_NOFS, NULL);
next:
search_start = extent_map_end(em);
free_extent_map(em);
@@ -2511,9 +2576,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 file_pos,
struct btrfs_file_extent_item *stack_fi,
+ const bool update_inode_bytes,
u64 qgroup_reserved)
{
struct btrfs_root *root = inode->root;
+ const u64 sectorsize = root->fs_info->sectorsize;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
@@ -2521,7 +2588,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
- int extent_inserted = 0;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
path = btrfs_alloc_path();
@@ -2537,18 +2604,20 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
- file_pos + num_bytes, NULL, 0,
- 1, sizeof(*stack_fi), &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = file_pos;
+ drop_args.end = file_pos + num_bytes;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*stack_fi);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
- if (!extent_inserted) {
+ if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
if (ret)
@@ -2563,7 +2632,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_add_bytes(&inode->vfs_inode, num_bytes);
+ /*
+ * If we dropped an inline extent here, we know the range where it is
+ * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
+ * number of bytes only for that range contaning the inline extent.
+ * The remaining of the range will be processed when clearning the
+ * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
+ */
+ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
+ u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
+
+ inline_size = drop_args.bytes_found - inline_size;
+ btrfs_update_inode_bytes(inode, sectorsize, inline_size);
+ drop_args.bytes_found -= inline_size;
+ num_bytes -= sectorsize;
+ }
+
+ if (update_inode_bytes)
+ btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
@@ -2601,6 +2687,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_file_extent_item stack_fi;
u64 logical_len;
+ bool update_inode_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
@@ -2616,9 +2703,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
+ /*
+ * For delalloc, when completing an ordered extent we update the inode's
+ * bytes when clearing the range in the inode's io tree, so pass false
+ * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
+ * except if the ordered extent was truncated.
+ */
+ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
+
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
oe->file_offset, &stack_fi,
- oe->qgroup_rsv);
+ update_inode_bytes, oe->qgroup_rsv);
}
/*
@@ -2628,11 +2724,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
- struct inode *inode = ordered_extent->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
u64 start, end;
int compress_type = 0;
@@ -2640,10 +2736,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
u64 logical_len = ordered_extent->num_bytes;
bool freespace_inode;
bool truncated = false;
- bool range_locked = false;
- bool clear_new_delalloc_bytes = false;
bool clear_reserved_extent = true;
- unsigned int clear_bits;
+ unsigned int clear_bits = EXTENT_DEFRAG;
start = ordered_extent->file_offset;
end = start + ordered_extent->num_bytes - 1;
@@ -2651,16 +2745,16 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
- clear_new_delalloc_bytes = true;
+ clear_bits |= EXTENT_DELALLOC_NEW;
- freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(inode);
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
goto out;
}
- btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
+ btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -2683,14 +2777,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
goto out;
}
- range_locked = true;
+ clear_bits |= EXTENT_LOCKED;
lock_extent_bits(io_tree, start, end, &cached_state);
if (freespace_inode)
@@ -2703,13 +2797,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- trans->block_rsv = &BTRFS_I(inode)->block_rsv;
+ trans->block_rsv = &inode->block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
+ ret = btrfs_mark_extent_written(trans, inode,
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
@@ -2723,8 +2817,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset,
+ unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -2737,6 +2830,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ /*
+ * If this is a new delalloc range, clear its new delalloc flag to
+ * update the inode's number of bytes. This needs to be done first
+ * before updating the inode item.
+ */
+ if ((clear_bits & EXTENT_DELALLOC_NEW) &&
+ !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
+ clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ 0, 0, &cached_state);
+
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
@@ -2745,12 +2849,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
ret = 0;
out:
- clear_bits = EXTENT_DEFRAG;
- if (range_locked)
- clear_bits |= EXTENT_LOCKED;
- if (clear_new_delalloc_bytes)
- clear_bits |= EXTENT_DELALLOC_NEW;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
+ clear_extent_bit(&inode->io_tree, start, end, clear_bits,
(clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
&cached_state);
@@ -2765,7 +2864,7 @@ out:
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
+ btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
/*
* If the ordered extent had an IOERR or something else went
@@ -2800,7 +2899,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
+ btrfs_remove_ordered_extent(inode, ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2841,18 +2940,32 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
btrfs_queue_work(wq, &ordered_extent->work);
}
+/*
+ * check_data_csum - verify checksum of one sector of uncompressed data
+ * @inode: inode
+ * @io_bio: btrfs_io_bio which contains the csum
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @page: page where is the data to be verified
+ * @pgoff: offset inside the page
+ *
+ * The length of such check is always one sector size.
+ */
static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
- int icsum, struct page *page, int pgoff, u64 start,
- size_t len)
+ u32 bio_offset, struct page *page, u32 pgoff)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
- u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ u32 len = fs_info->sectorsize;
+ const u32 csum_size = fs_info->csum_size;
+ unsigned int offset_sectors;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
- csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
+ ASSERT(pgoff + len <= PAGE_SIZE);
+
+ offset_sectors = bio_offset >> fs_info->sectorsize_bits;
+ csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
@@ -2865,8 +2978,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
kunmap_atomic(kaddr);
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
+ csum, csum_expected, io_bio->mirror_num);
if (io_bio->device)
btrfs_dev_stat_inc_and_print(io_bio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -2877,17 +2990,23 @@ zeroit:
}
/*
- * when reads are done, we need to check csums to verify the data is correct
+ * When reads are done, we need to check csums to verify the data is correct.
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
+ *
+ * @bio_offset: offset to the beginning of the bio (in bytes)
+ * @start: file offset of the range start
+ * @end: file offset of the range end (inclusive)
+ * @mirror: mirror number
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror)
{
- size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u32 sectorsize = root->fs_info->sectorsize;
+ u32 pg_off;
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -2897,15 +3016,27 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
return 0;
+ if (!root->fs_info->csum_root)
+ return 0;
+
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
return 0;
}
- phy_offset >>= inode->i_sb->s_blocksize_bits;
- return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
- (size_t)(end - start + 1));
+ ASSERT(page_offset(page) <= start &&
+ end <= page_offset(page) + PAGE_SIZE - 1);
+ for (pg_off = offset_in_page(start);
+ pg_off < offset_in_page(end);
+ pg_off += sectorsize, bio_offset += sectorsize) {
+ int ret;
+
+ ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+ if (ret < 0)
+ return -EIO;
+ }
+ return 0;
}
/*
@@ -3515,7 +3646,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* copy everything in the in-memory inode into the btree.
*/
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
@@ -3526,9 +3658,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
- ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
- 1);
+ ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -3539,9 +3669,9 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, leaf, inode_item, inode);
+ fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_mark_buffer_dirty(leaf);
- btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+ btrfs_set_inode_last_trans(trans, inode);
ret = 0;
failed:
btrfs_free_path(path);
@@ -3552,7 +3682,8 @@ failed:
* copy everything in the in-memory inode into the btree.
*/
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode)
+ struct btrfs_root *root,
+ struct btrfs_inode *inode)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3564,23 +3695,22 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* The data relocation inode should also be directly updated
* without delay
*/
- if (!btrfs_is_free_space_inode(BTRFS_I(inode))
+ if (!btrfs_is_free_space_inode(inode)
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
if (!ret)
- btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
+ btrfs_set_inode_last_trans(trans, inode);
return ret;
}
return btrfs_update_inode_item(trans, root, inode);
}
-noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_inode *inode)
{
int ret;
@@ -3615,7 +3745,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto out;
}
- path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
@@ -3695,7 +3824,7 @@ err:
inode_inc_iversion(&dir->vfs_inode);
inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
+ ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
}
@@ -3709,7 +3838,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, inode);
}
return ret;
}
@@ -3858,7 +3987,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = current_time(dir);
- ret = btrfs_update_inode_fallback(trans, root, dir);
+ ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
if (ret)
btrfs_abort_transaction(trans, ret);
out:
@@ -3995,7 +4124,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
struct btrfs_block_rsv block_rsv;
u64 root_flags;
int ret;
- int err;
/*
* Don't allow to delete a subvolume with send in progress. This is
@@ -4017,8 +4145,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
down_write(&fs_info->subvol_sem);
- err = may_destroy_subvol(dest);
- if (err)
+ ret = may_destroy_subvol(dest);
+ if (ret)
goto out_up_write;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
@@ -4027,13 +4155,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* two for dir entries,
* two for root ref/backref.
*/
- err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
- if (err)
+ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+ if (ret)
goto out_up_write;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_release;
}
trans->block_rsv = &block_rsv;
@@ -4043,7 +4171,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
- err = ret;
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -4052,7 +4179,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
- dest->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&dest->root_item, 0);
btrfs_set_root_refs(&dest->root_item, 0);
if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
@@ -4061,7 +4188,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
@@ -4071,7 +4197,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
@@ -4081,7 +4206,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
}
@@ -4092,14 +4216,12 @@ out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
ret = btrfs_end_transaction(trans);
- if (ret && !err)
- err = ret;
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
- if (err) {
+ if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
@@ -4109,15 +4231,9 @@ out_up_write:
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
-
- /* the last ref */
- if (dest->ino_cache_inode) {
- iput(dest->ino_cache_inode);
- dest->ino_cache_inode = NULL;
- }
}
- return err;
+ return ret;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -4194,7 +4310,7 @@ out:
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct inode *inode,
+ struct btrfs_inode *inode,
u64 new_size, u32 min_type)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4215,7 +4331,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_slot = 0;
int extent_type = -1;
int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ u64 ino = btrfs_ino(inode);
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
@@ -4229,7 +4345,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* off from time to time. This means all inodes in subvolume roots,
* reloc roots, and data reloc roots.
*/
- if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
+ if (!btrfs_is_free_space_inode(inode) &&
test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
be_nice = true;
@@ -4239,7 +4355,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ lock_extent_bits(&inode->io_tree, lock_start, (u64)-1,
&cached_state);
/*
@@ -4247,7 +4363,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* new size is not block aligned since we will be keeping the
* last block of the extent just the way it is.
*/
- btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
+ btrfs_drop_extent_cache(inode, ALIGN(new_size,
fs_info->sectorsize),
(u64)-1, 0);
}
@@ -4258,8 +4374,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* it is used to drop the logged items. So we shouldn't kill the delayed
* items.
*/
- if (min_type == 0 && root == BTRFS_I(inode)->root)
- btrfs_kill_delayed_inode_items(BTRFS_I(inode));
+ if (min_type == 0 && root == inode->root)
+ btrfs_kill_delayed_inode_items(inode);
key.objectid = ino;
key.offset = (u64)-1;
@@ -4315,14 +4431,13 @@ search_again:
btrfs_file_extent_num_bytes(leaf, fi);
trace_btrfs_truncate_show_fi_regular(
- BTRFS_I(inode), leaf, fi,
- found_key.offset);
+ inode, leaf, fi, found_key.offset);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_ram_bytes(leaf,
fi);
trace_btrfs_truncate_show_fi_inline(
- BTRFS_I(inode), leaf, fi, path->slots[0],
+ inode, leaf, fi, path->slots[0],
found_key.offset);
}
item_end--;
@@ -4361,7 +4476,8 @@ search_again:
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state) &&
extent_start != 0)
- inode_sub_bytes(inode, num_dec);
+ inode_sub_bytes(&inode->vfs_inode,
+ num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
extent_num_bytes =
@@ -4376,7 +4492,8 @@ search_again:
found_extent = 1;
if (test_bit(BTRFS_ROOT_SHAREABLE,
&root->state))
- inode_sub_bytes(inode, num_dec);
+ inode_sub_bytes(&inode->vfs_inode,
+ num_dec);
}
}
clear_len = num_dec;
@@ -4411,7 +4528,8 @@ search_again:
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
- inode_sub_bytes(inode, item_end + 1 - new_size);
+ inode_sub_bytes(&inode->vfs_inode,
+ item_end + 1 - new_size);
}
delete:
/*
@@ -4419,8 +4537,8 @@ delete:
* multiple fsyncs, and in this case we don't want to clear the
* file extent range because it's just the log.
*/
- if (root == BTRFS_I(inode)->root) {
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ if (root == inode->root) {
+ ret = btrfs_inode_clear_file_extent_range(inode,
clear_start, clear_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4529,8 +4647,8 @@ out:
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_inode_safe_disk_i_size_write(inode, last_size);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
- (u64)-1, &cached_state);
+ unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1,
+ &cached_state);
}
btrfs_free_path(path);
@@ -4548,12 +4666,12 @@ out:
* This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
-int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
+ int front)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct address_space *mapping = inode->i_mapping;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
@@ -4576,30 +4694,29 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved,
- block_start, blocksize);
+ ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+ blocksize);
if (ret < 0) {
- if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start,
- &write_bytes) > 0) {
+ if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
/* For nocow case, no need to reserve data space */
only_release_metadata = true;
} else {
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
if (ret < 0) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, block_start, blocksize);
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ block_start, blocksize);
goto out;
}
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ btrfs_delalloc_release_space(inode, data_reserved, block_start,
+ blocksize, true);
+ btrfs_delalloc_release_extents(inode, blocksize);
ret = -ENOMEM;
goto out;
}
@@ -4622,7 +4739,7 @@ again:
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start);
+ ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
unlock_extent_cached(io_tree, block_start, block_end,
&cached_state);
@@ -4633,11 +4750,11 @@ again:
goto again;
}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
+ clear_extent_bit(&inode->io_tree, block_start, block_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0,
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, block_start, block_end,
@@ -4663,34 +4780,33 @@ again:
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
- block_end, EXTENT_NORESERVE, NULL, NULL,
- GFP_NOFS);
+ set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
out_unlock:
if (ret) {
if (only_release_metadata)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- blocksize, true);
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
else
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
+ btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ btrfs_delalloc_release_extents(inode, blocksize);
unlock_page(page);
put_page(page);
out:
if (only_release_metadata)
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+ btrfs_check_nocow_unlock(inode);
extent_changeset_free(data_reserved);
return ret;
}
-static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
u64 offset, u64 len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
/*
@@ -4698,9 +4814,9 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* that any holes get logged if we fsync.
*/
if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ inode->last_trans = fs_info->generation;
+ inode->last_sub_trans = root->log_transid;
+ inode->last_log_commit = root->last_log_commit;
return 0;
}
@@ -4713,19 +4829,25 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ drop_args.start = offset;
+ drop_args.end = offset + len;
+ drop_args.drop_cache = true;
+
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
- ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
+ if (ret) {
btrfs_abort_transaction(trans, ret);
- else
+ } else {
+ btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
btrfs_update_inode(trans, root, inode);
+ }
btrfs_end_transaction(trans);
return ret;
}
@@ -4736,14 +4858,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
* the range between oldsize and size
*/
-int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
+int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map_tree *em_tree = &inode->extent_tree;
u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
u64 block_end = ALIGN(size, fs_info->sectorsize);
u64 last_byte;
@@ -4763,11 +4885,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
- btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
- block_end - 1, &cached_state);
+ btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
+ &cached_state);
cur_offset = hole_start;
while (1) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
+ em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
@@ -4786,17 +4908,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (err)
break;
- err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
- btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ &inode->runtime_flags);
goto next;
}
hole_em->start = cur_offset;
@@ -4816,14 +4938,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
write_unlock(&em_tree->lock);
if (err != -EEXIST)
break;
- btrfs_drop_extent_cache(BTRFS_I(inode),
- cur_offset,
+ btrfs_drop_extent_cache(inode, cur_offset,
cur_offset +
hole_size - 1, 0);
}
free_extent_map(hole_em);
} else {
- err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ err = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
if (err)
break;
@@ -4871,7 +4992,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* this truncation.
*/
btrfs_drew_write_lock(&root->snapshot_lock);
- ret = btrfs_cont_expand(inode, oldsize, newsize);
+ ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
if (ret) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return ret;
@@ -4884,9 +5005,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
}
i_size_write(inode, newsize);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
pagecache_isize_extended(inode, oldsize, newsize);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
@@ -5157,7 +5278,8 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
+ 0, 0);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -5184,10 +5306,6 @@ void btrfs_evict_inode(struct inode *inode)
btrfs_end_transaction(trans);
}
- if (!(root == fs_info->tree_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
- btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
-
free_rsv:
btrfs_free_block_rsv(fs_info, rsv);
no_delete:
@@ -5797,7 +5915,7 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret && ret == -ENOSPC) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
@@ -5805,7 +5923,7 @@ static int btrfs_dirty_inode(struct inode *inode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
}
btrfs_end_transaction(trans);
if (BTRFS_I(inode)->delayed_node)
@@ -6068,7 +6186,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail_unlock;
@@ -6194,7 +6311,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
parent_inode->vfs_inode.i_mtime = now;
parent_inode->vfs_inode.i_ctime = now;
}
- ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
+ ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
return ret;
@@ -6254,7 +6371,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6285,7 +6402,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6318,7 +6435,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6344,7 +6461,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_unlock;
@@ -6416,7 +6533,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto fail;
if (inode->i_nlink == 1) {
@@ -6462,7 +6579,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_fail;
@@ -6484,7 +6601,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
btrfs_i_size_write(BTRFS_I(inode), 0);
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (err)
goto out_fail;
@@ -6621,12 +6738,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path->reada = READA_FORWARD;
/*
- * Unless we're going to uncompress the inline extent, no sleep would
- * happen.
+ * The same explanation in load_free_space_cache applies here as well,
+ * we only read when we're loading the free space cache, and at that
+ * point the commit_root has everything we need.
*/
- path->leave_spinning = 1;
-
- path->recurse = btrfs_is_free_space_inode(inode);
+ if (btrfs_is_free_space_inode(inode)) {
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
@@ -6728,7 +6847,6 @@ next:
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
- btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -7377,17 +7495,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
int ret = 0;
u64 len = length;
bool unlock_extents = false;
- bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
-
- /*
- * We used current->journal_info here to see if we were sync, but
- * there's a lot of tests in the enospc machinery to not do flushing if
- * we have a journal_info set, so we need to clear this out and re-set
- * it in iomap_end.
- */
- ASSERT(current->journal_info == NULL ||
- current->journal_info == BTRFS_DIO_SYNC_STUB);
- current->journal_info = NULL;
if (!write)
len = min_t(u64, len, fs_info->sectorsize);
@@ -7413,7 +7520,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (!dio_data)
return -ENOMEM;
- dio_data->sync = sync;
dio_data->length = length;
if (write) {
dio_data->reserve = round_up(length, fs_info->sectorsize);
@@ -7561,14 +7667,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
extent_changeset_free(dio_data->data_reserved);
}
out:
- /*
- * We're all done, we can re-set the current->journal_info now safely
- * for our endio.
- */
- if (dio_data->sync) {
- ASSERT(current->journal_info == NULL);
- current->journal_info = BTRFS_DIO_SYNC_STUB;
- }
kfree(dio_data);
iomap->private = NULL;
@@ -7632,7 +7730,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
struct bio_vec bvec;
struct bvec_iter iter;
u64 start = io_bio->logical;
- int icsum = 0;
+ u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
__bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
@@ -7643,9 +7741,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio, icsum,
- bvec.bv_page, pgoff,
- start, sectorsize))) {
+ (!csum || !check_data_csum(inode, io_bio,
+ bio_offset, bvec.bv_page, pgoff))) {
clean_io_failure(fs_info, failure_tree, io_tree,
start, bvec.bv_page,
btrfs_ino(BTRFS_I(inode)),
@@ -7653,6 +7750,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
} else {
blk_status_t status;
+ ASSERT((start - io_bio->logical) < UINT_MAX);
status = btrfs_submit_read_repair(inode,
&io_bio->bio,
start - io_bio->logical,
@@ -7665,7 +7763,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
err = status;
}
start += sectorsize;
- icsum++;
+ ASSERT(bio_offset + sectorsize > bio_offset);
+ bio_offset += sectorsize;
pgoff += sectorsize;
}
}
@@ -7715,12 +7814,11 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
}
}
-static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
- struct bio *bio, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
+ struct bio *bio,
+ u64 dio_file_offset)
{
- struct inode *inode = private_data;
-
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7732,8 +7830,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
"direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
- bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ) {
@@ -7770,8 +7867,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto map;
if (write && async_submit) {
- ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
- file_offset, inode,
+ ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
btrfs_submit_bio_start_direct_io);
goto err;
} else if (write) {
@@ -7786,8 +7882,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
u64 csum_offset;
csum_offset = file_offset - dip->logical_offset;
- csum_offset >>= inode->i_sb->s_blocksize_bits;
- csum_offset *= btrfs_super_csum_size(fs_info->super_copy);
+ csum_offset >>= fs_info->sectorsize_bits;
+ csum_offset *= fs_info->csum_size;
btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
}
map:
@@ -7812,11 +7908,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip_size = sizeof(*dip);
if (!write && csum) {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
size_t nblocks;
- nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
- dip_size += csum_size * nblocks;
+ nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
+ dip_size += fs_info->csum_size * nblocks;
}
dip = kzalloc(dip_size, GFP_NOFS);
@@ -7826,7 +7921,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->inode = inode;
dip->logical_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
return dip;
@@ -7836,7 +7931,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
- const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
@@ -7863,13 +7957,14 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
return BLK_QC_T_NONE;
}
- if (!write && csum) {
+ if (!write) {
/*
* Load the csums up front to reduce csum tree searches and
* contention when submitting bios.
+ *
+ * If we have csums disabled this will do nothing.
*/
- status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
- dip->csums);
+ status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
if (status != BLK_STS_OK)
goto out_err;
}
@@ -7944,129 +8039,15 @@ out_err:
return BLK_QC_T_NONE;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int seg;
- int i;
- unsigned int blocksize_mask = fs_info->sectorsize - 1;
- ssize_t retval = -EINVAL;
-
- if (offset & blocksize_mask)
- goto out;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
- return 0;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
- goto out;
- }
- }
- retval = 0;
-out:
- return retval;
-}
-
-static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
- int error, unsigned flags)
-{
- /*
- * Now if we're still in the context of our submitter we know we can't
- * safely run generic_write_sync(), so clear our flag here so that the
- * caller knows to follow up with a sync.
- */
- if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
- current->journal_info = NULL;
- return error;
- }
-
- if (error)
- return error;
-
- if (size) {
- iocb->ki_flags |= IOCB_DSYNC;
- return generic_write_sync(iocb, size);
- }
-
- return 0;
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
+const struct iomap_ops btrfs_dio_iomap_ops = {
.iomap_begin = btrfs_dio_iomap_begin,
.iomap_end = btrfs_dio_iomap_end,
};
-static const struct iomap_dio_ops btrfs_dio_ops = {
+const struct iomap_dio_ops btrfs_dio_ops = {
.submit_io = btrfs_submit_direct,
};
-static const struct iomap_dio_ops btrfs_sync_dops = {
- .submit_io = btrfs_submit_direct,
- .end_io = btrfs_maybe_fsync_end_io,
-};
-
-ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_changeset *data_reserved = NULL;
- loff_t offset = iocb->ki_pos;
- size_t count = 0;
- bool relock = false;
- ssize_t ret;
-
- if (check_direct_IO(fs_info, iter, offset))
- return 0;
-
- count = iov_iter_count(iter);
- if (iov_iter_rw(iter) == WRITE) {
- /*
- * If the write DIO is beyond the EOF, we need update
- * the isize, but it is protected by i_mutex. So we can
- * not unlock the i_mutex at this case.
- */
- if (offset + count <= inode->i_size) {
- inode_unlock(inode);
- relock = true;
- }
- down_read(&BTRFS_I(inode)->dio_sem);
- }
-
- /*
- * We have are actually a sync iocb, so we need our fancy endio to know
- * if we need to sync.
- */
- if (current->journal_info)
- ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
- &btrfs_sync_dops, is_sync_kiocb(iocb));
- else
- ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
- &btrfs_dio_ops, is_sync_kiocb(iocb));
-
- if (ret == -ENOTBLK)
- ret = 0;
-
- if (iov_iter_rw(iter) == WRITE)
- up_read(&BTRFS_I(inode)->dio_sem);
-
- if (relock)
- inode_lock(inode);
-
- extent_changeset_free(data_reserved);
- return ret;
-}
-
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -8186,6 +8167,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
u64 start;
u64 end;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ bool found_ordered = false;
+ bool completed_ordered = false;
/*
* we have the page locked, so new writeback can't start,
@@ -8207,15 +8190,17 @@ again:
start = page_start;
ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
+ found_ordered = true;
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
/*
- * IO on this page will never be started, so we need
- * to account for any ordered extents now
+ * IO on this page will never be started, so we need to account
+ * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
+ * here, must leave that up for the ordered extent completion.
*/
if (!inode_evicting)
clear_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
/*
@@ -8237,8 +8222,10 @@ again:
if (btrfs_dec_test_ordered_pending(inode, &ordered,
start,
- end - start + 1, 1))
+ end - start + 1, 1)) {
btrfs_finish_ordered_io(ordered);
+ completed_ordered = true;
+ }
}
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
@@ -8267,10 +8254,23 @@ again:
*/
btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
+ bool delete = true;
+
+ /*
+ * If there's an ordered extent for this range and we have not
+ * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
+ * in the range for the ordered extent completion. We must also
+ * not delete the range, otherwise we would lose that bit (and
+ * any other bits set in the range). Make sure EXTENT_UPTODATE
+ * is cleared if we don't delete, otherwise it can lead to
+ * corruptions if the i_size is extented later.
+ */
+ if (found_ordered && !completed_ordered)
+ delete = false;
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state);
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete, &cached_state);
__btrfs_releasepage(page, GFP_NOFS);
}
@@ -8519,14 +8519,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
trans->block_rsv = rsv;
while (1) {
- ret = btrfs_truncate_inode_items(trans, root, inode,
+ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
break;
@@ -8557,7 +8557,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -8565,14 +8565,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
ret = PTR_ERR(trans);
goto out;
}
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
if (trans) {
int ret2;
trans->block_rsv = &fs_info->trans_block_rsv;
- ret2 = btrfs_update_inode(trans, root, inode);
+ ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret2 && !ret)
ret = ret2;
@@ -8618,7 +8618,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
"error inheriting subvolume %llu properties: %d",
new_root->root_key.objectid, err);
- err = btrfs_update_inode(trans, new_root, inode);
+ err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
iput(inode);
return err;
@@ -8680,7 +8680,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
- init_rwsem(&ei->dio_sem);
return inode;
}
@@ -8820,6 +8819,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
u64 delalloc_bytes;
+ u64 inode_bytes;
struct inode *inode = d_inode(path->dentry);
u32 blocksize = inode->i_sb->s_blocksize;
u32 bi_flags = BTRFS_I(inode)->flags;
@@ -8846,8 +8846,9 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
+ inode_bytes = inode_get_bytes(inode);
spin_unlock(&BTRFS_I(inode)->lock);
- stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
+ stat->blocks = (ALIGN(inode_bytes, blocksize) +
ALIGN(delalloc_bytes, blocksize)) >> 9;
return 0;
}
@@ -8973,7 +8974,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -8989,7 +8990,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_dentry->d_name.name,
new_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, dest, new_inode);
+ ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9078,7 +9079,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
u64 objectid;
u64 index;
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
return ret;
@@ -9109,7 +9110,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
unlock_new_inode(inode);
if (ret)
@@ -9243,7 +9244,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
- ret = btrfs_update_inode(trans, root, old_inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9541,7 +9542,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -9603,7 +9604,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode_nohighmem(inode);
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
- err = btrfs_update_inode(trans, root, inode);
+ err = btrfs_update_inode(trans, root, BTRFS_I(inode));
/*
* Last step, add directory indexes for our symlink inode. This is the
* last step to avoid extra cleanup of these indexes if an error happens
@@ -9629,7 +9630,8 @@ out_unlock:
static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_trans_handle *trans_in,
- struct inode *inode, struct btrfs_key *ins,
+ struct btrfs_inode *inode,
+ struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
@@ -9650,13 +9652,14 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
- ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
+ ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
if (trans) {
- ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
- file_offset, &stack_fi, ret);
+ ret = insert_reserved_file_extent(trans, inode,
+ file_offset, &stack_fi,
+ true, ret);
if (ret)
return ERR_PTR(ret);
return trans;
@@ -9676,7 +9679,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
if (!path)
return ERR_PTR(-ENOMEM);
- ret = btrfs_replace_file_extents(inode, path, file_offset,
+ ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
@@ -9732,7 +9735,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
clear_offset += ins.offset;
last_alloc = ins.offset;
- trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
+ &ins, cur_offset);
/*
* Now that we inserted the prealloc extent we can finally
* decrement the number of reservations in the block group.
@@ -9794,10 +9798,10 @@ next:
else
i_size = cur_offset;
i_size_write(inode, i_size);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -9872,7 +9876,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
goto out;
@@ -9893,7 +9897,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (ret)
goto out;
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
goto out;
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
@@ -10272,6 +10276,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
#endif
+/*
+ * Update the number of bytes used in the VFS' inode. When we replace extents in
+ * a range (clone, dedupe, fallocate's zero range), we must update the number of
+ * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
+ * always get a correct value.
+ */
+void btrfs_update_inode_bytes(struct btrfs_inode *inode,
+ const u64 add_bytes,
+ const u64 del_bytes)
+{
+ if (add_bytes == del_bytes)
+ return;
+
+ spin_lock(&inode->lock);
+ if (del_bytes > 0)
+ inode_sub_bytes(&inode->vfs_inode, del_bytes);
+ if (add_bytes > 0)
+ inode_add_bytes(&inode->vfs_inode, add_bytes);
+ spin_unlock(&inode->lock);
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 69a384145dc6..703212ff50a5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -34,7 +34,6 @@
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
-#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
@@ -193,6 +192,15 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
+static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+ unsigned int flags)
+{
+ if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
+ return -EPERM;
+
+ return 0;
+}
+
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -230,6 +238,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
goto out_unlock;
+ ret = check_fsflags_compatible(fs_info, fsflags);
+ if (ret)
+ goto out_unlock;
+
binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
@@ -336,7 +348,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out_end_trans:
btrfs_end_transaction(trans);
@@ -479,7 +491,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
btrfs_end_transaction(trans);
@@ -733,7 +745,7 @@ static noinline int create_subvol(struct inode *dir,
}
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1275,6 +1287,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
u64 page_end;
u64 page_cnt;
u64 start = (u64)start_index << PAGE_SHIFT;
+ u64 search_start;
int ret;
int i;
int i_done;
@@ -1371,6 +1384,40 @@ again:
lock_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
+
+ /*
+ * When defragmenting we skip ranges that have holes or inline extents,
+ * (check should_defrag_range()), to avoid unnecessary IO and wasting
+ * space. At btrfs_defrag_file(), we check if a range should be defragged
+ * before locking the inode and then, if it should, we trigger a sync
+ * page cache readahead - we lock the inode only after that to avoid
+ * blocking for too long other tasks that possibly want to operate on
+ * other file ranges. But before we were able to get the inode lock,
+ * some other task may have punched a hole in the range, or we may have
+ * now an inline extent, in which case we should not defrag. So check
+ * for that here, where we have the inode and the range locked, and bail
+ * out if that happened.
+ */
+ search_start = page_start;
+ while (search_start < page_end) {
+ struct extent_map *em;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
+ page_end - search_start);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_range;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ /* Ok, 0 means we did not defrag anything */
+ ret = 0;
+ goto out_unlock_range;
+ }
+ search_start = extent_map_end(em);
+ free_extent_map(em);
+ }
+
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
@@ -1401,6 +1448,10 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
+
+out_unlock_range:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state);
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
@@ -1678,7 +1729,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
@@ -3321,7 +3372,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
rcu_read_lock();
dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL, true);
+ NULL);
if (!dev) {
ret = -ENODEV;
@@ -3393,7 +3444,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = -ENOMEM;
goto out_free;
}
- path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66e02ebdd340..5fafc5e89bb7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -17,404 +17,89 @@
* Extent buffer locking
* =====================
*
- * The locks use a custom scheme that allows to do more operations than are
- * available fromt current locking primitives. The building blocks are still
- * rwlock and wait queues.
- *
- * Required semantics:
+ * We use a rw_semaphore for tree locking, and the semantics are exactly the
+ * same:
*
* - reader/writer exclusion
* - writer/writer exclusion
* - reader/reader sharing
- * - spinning lock semantics
- * - blocking lock semantics
* - try-lock semantics for readers and writers
- * - one level nesting, allowing read lock to be taken by the same thread that
- * already has write lock
- *
- * The extent buffer locks (also called tree locks) manage access to eb data
- * related to the storage in the b-tree (keys, items, but not the individual
- * members of eb).
- * We want concurrency of many readers and safe updates. The underlying locking
- * is done by read-write spinlock and the blocking part is implemented using
- * counters and wait queues.
- *
- * spinning semantics - the low-level rwlock is held so all other threads that
- * want to take it are spinning on it.
- *
- * blocking semantics - the low-level rwlock is not held but the counter
- * denotes how many times the blocking lock was held;
- * sleeping is possible
- *
- * Write lock always allows only one thread to access the data.
- *
- *
- * Debugging
- * ---------
- *
- * There are additional state counters that are asserted in various contexts,
- * removed from non-debug build to reduce extent_buffer size and for
- * performance reasons.
- *
- *
- * Lock recursion
- * --------------
- *
- * A write operation on a tree might indirectly start a look up on the same
- * tree. This can happen when btrfs_cow_block locks the tree and needs to
- * lookup free extents.
- *
- * btrfs_cow_block
- * ..
- * alloc_tree_block_no_bg_flush
- * btrfs_alloc_tree_block
- * btrfs_reserve_extent
- * ..
- * load_free_space_cache
- * ..
- * btrfs_lookup_file_extent
- * btrfs_search_slot
- *
- *
- * Locking pattern - spinning
- * --------------------------
- *
- * The simple locking scenario, the +--+ denotes the spinning section.
- *
- * +- btrfs_tree_lock
- * | - extent_buffer::rwlock is held
- * | - no heavy operations should happen, eg. IO, memory allocations, large
- * | structure traversals
- * +- btrfs_tree_unock
-*
-*
- * Locking pattern - blocking
- * --------------------------
- *
- * The blocking write uses the following scheme. The +--+ denotes the spinning
- * section.
- *
- * +- btrfs_tree_lock
- * |
- * +- btrfs_set_lock_blocking_write
- *
- * - allowed: IO, memory allocations, etc.
- *
- * -- btrfs_tree_unlock - note, no explicit unblocking necessary
- *
- *
- * Blocking read is similar.
- *
- * +- btrfs_tree_read_lock
- * |
- * +- btrfs_set_lock_blocking_read
- *
- * - heavy operations allowed
- *
- * +- btrfs_tree_read_unlock_blocking
- * |
- * +- btrfs_tree_read_unlock
- *
- */
-
-#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
- eb->spinning_writers++;
-}
-
-static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers != 1);
- eb->spinning_writers--;
-}
-
-static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
-{
- WARN_ON(eb->spinning_writers);
-}
-
-static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
-{
- WARN_ON(atomic_read(&eb->spinning_readers) == 0);
- atomic_dec(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
-{
- atomic_dec(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
-{
- BUG_ON(!atomic_read(&eb->read_locks));
-}
-
-static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
-{
- eb->write_locks++;
-}
-
-static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
-{
- eb->write_locks--;
-}
-
-#else
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
-#endif
-
-/*
- * Mark already held read lock as blocking. Can be nested in write lock by the
- * same thread.
- *
- * Use when there are potentially long operations ahead so other thread waiting
- * on the lock will not actively spin but sleep instead.
- *
- * The rwlock is released and blocking reader counter is increased.
- */
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_read(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner)
- return;
- btrfs_assert_tree_read_locked(eb);
- atomic_inc(&eb->blocking_readers);
- btrfs_assert_spinning_readers_put(eb);
- read_unlock(&eb->lock);
-}
-
-/*
- * Mark already held write lock as blocking.
- *
- * Use when there are potentially long operations ahead so other threads
- * waiting on the lock will not actively spin but sleep instead.
*
- * The rwlock is released and blocking writers is set.
+ * The rwsem implementation does opportunistic spinning which reduces number of
+ * times the locking task needs to sleep.
*/
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
-{
- trace_btrfs_set_lock_blocking_write(eb);
- /*
- * No lock is required. The lock owner may change if we have a read
- * lock, but it won't change to or away from us. If we have the write
- * lock, we are the owner and it'll never change.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner)
- return;
- if (eb->blocking_writers == 0) {
- btrfs_assert_spinning_writers_put(eb);
- btrfs_assert_tree_locked(eb);
- WRITE_ONCE(eb->blocking_writers, 1);
- write_unlock(&eb->lock);
- }
-}
/*
- * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
- * Can be nested in write lock by the same thread.
+ * __btrfs_tree_read_lock - lock extent buffer for read
+ * @eb: the eb to be locked
+ * @nest: the nesting level to be used for lockdep
*
- * Use when the locked section does only lightweight actions and busy waiting
- * would be cheaper than making other threads do the wait/wake loop.
- *
- * The rwlock is held upon exit.
+ * This takes the read lock on the extent buffer, using the specified nesting
+ * level for lockdep purposes.
*/
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
- bool recurse)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
u64 start_ns = 0;
if (trace_btrfs_tree_read_lock_enabled())
start_ns = ktime_get_ns();
-again:
- read_lock(&eb->lock);
- BUG_ON(eb->blocking_writers == 0 &&
- current->pid == eb->lock_owner);
- if (eb->blocking_writers) {
- if (current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread.
- * We allow an additional read lock to be added because
- * it's for the same thread. btrfs_find_all_roots()
- * depends on this as it may be called on a partly
- * (write-)locked tree.
- */
- WARN_ON(!recurse);
- BUG_ON(eb->lock_recursed);
- eb->lock_recursed = true;
- read_unlock(&eb->lock);
- trace_btrfs_tree_read_lock(eb, start_ns);
- return;
- }
- read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- READ_ONCE(eb->blocking_writers) == 0);
- goto again;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
+
+ down_read_nested(&eb->lock, nest);
+ eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
-}
-
-/*
- * Lock extent buffer for read, optimistically expecting that there are no
- * contending blocking writers. If there are, don't wait.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
-{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- read_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
- }
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_tree_read_lock_atomic(eb);
- return 1;
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}
/*
- * Try-lock for read. Don't block or wait for contending writers.
+ * Try-lock for read.
*
* Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers))
- return 0;
-
- if (!read_trylock(&eb->lock))
- return 0;
-
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers)) {
- read_unlock(&eb->lock);
- return 0;
+ if (down_read_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_read_lock(eb);
+ return 1;
}
- btrfs_assert_tree_read_locks_get(eb);
- btrfs_assert_spinning_readers_get(eb);
- trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return 0;
}
/*
- * Try-lock for write. May block until the lock is uncontended, but does not
- * wait until it is free.
+ * Try-lock for write.
*
* Retrun 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
- return 0;
-
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
- write_unlock(&eb->lock);
- return 0;
+ if (down_write_trylock(&eb->lock)) {
+ eb->lock_owner = current->pid;
+ trace_btrfs_try_tree_write_lock(eb);
+ return 1;
}
- btrfs_assert_tree_write_locks_get(eb);
- btrfs_assert_spinning_writers_get(eb);
- eb->lock_owner = current->pid;
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
+ return 0;
}
/*
- * Release read lock. Must be used only if the lock is in spinning mode. If
- * the read lock is nested, must pair with read lock before the write unlock.
- *
- * The rwlock is not held upon exit.
+ * Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_recursed
- * field only matters to the lock owner.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner) {
- eb->lock_recursed = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- btrfs_assert_spinning_readers_put(eb);
- btrfs_assert_tree_read_locks_put(eb);
- read_unlock(&eb->lock);
-}
-
-/*
- * Release read lock, previously set to blocking by a pairing call to
- * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
- * thread.
- *
- * State of rwlock is unchanged, last reader wakes waiting threads.
- */
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
-{
- trace_btrfs_tree_read_unlock_blocking(eb);
- /*
- * if we're nested, we have the write lock. No new locking
- * is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_recursed
- * field only matters to the lock owner.
- */
- if (eb->lock_recursed && current->pid == eb->lock_owner) {
- eb->lock_recursed = false;
- return;
- }
- btrfs_assert_tree_read_locked(eb);
- WARN_ON(atomic_read(&eb->blocking_readers) == 0);
- /* atomic_dec_and_test implies a barrier */
- if (atomic_dec_and_test(&eb->blocking_readers))
- cond_wake_up_nomb(&eb->read_lock_wq);
- btrfs_assert_tree_read_locks_put(eb);
+ eb->lock_owner = 0;
+ up_read(&eb->lock);
}
/*
- * Lock for write. Wait for all blocking and spinning readers and writers. This
- * starts context where reader lock could be nested by the same thread.
+ * __btrfs_tree_lock - lock eb for write
+ * @eb: the eb to lock
+ * @nest: the nesting to use for the lock
*
- * The rwlock is held for write upon exit.
+ * Returns with the eb->lock write locked.
*/
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
@@ -424,19 +109,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
if (trace_btrfs_tree_lock_enabled())
start_ns = ktime_get_ns();
- WARN_ON(eb->lock_owner == current->pid);
-again:
- wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
- write_lock(&eb->lock);
- /* Refetch value after lock */
- if (atomic_read(&eb->blocking_readers) ||
- READ_ONCE(eb->blocking_writers)) {
- write_unlock(&eb->lock);
- goto again;
- }
- btrfs_assert_spinning_writers_get(eb);
- btrfs_assert_tree_write_locks_get(eb);
+ down_write_nested(&eb->lock, nest);
eb->lock_owner = current->pid;
trace_btrfs_tree_lock(eb, start_ns);
}
@@ -447,68 +120,13 @@ void btrfs_tree_lock(struct extent_buffer *eb)
}
/*
- * Release the write lock, either blocking or spinning (ie. there's no need
- * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
- * This also ends the context for nesting, the read lock must have been
- * released already.
- *
- * Tasks blocked and waiting are woken, rwlock is not held upon exit.
+ * Release the write lock.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
- /*
- * This is read both locked and unlocked but always by the same thread
- * that already owns the lock so we don't need to use READ_ONCE
- */
- int blockers = eb->blocking_writers;
-
- BUG_ON(blockers > 1);
-
- btrfs_assert_tree_locked(eb);
trace_btrfs_tree_unlock(eb);
eb->lock_owner = 0;
- btrfs_assert_tree_write_locks_put(eb);
-
- if (blockers) {
- btrfs_assert_no_spinning_writers(eb);
- /* Unlocked write */
- WRITE_ONCE(eb->blocking_writers, 0);
- /*
- * We need to order modifying blocking_writers above with
- * actually waking up the sleepers to ensure they see the
- * updated value of blocking_writers
- */
- cond_wake_up(&eb->write_lock_wq);
- } else {
- btrfs_assert_spinning_writers_put(eb);
- write_unlock(&eb->lock);
- }
-}
-
-/*
- * Set all locked nodes in the path to blocking locks. This should be done
- * before scheduling
- */
-void btrfs_set_path_blocking(struct btrfs_path *p)
-{
- int i;
-
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
+ up_write(&eb->lock);
}
/*
@@ -564,14 +182,13 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
- bool recurse)
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
+ btrfs_tree_read_lock(eb);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 3ea81ed3320b..a2e1f1f5c6e3 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -13,8 +13,6 @@
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
-#define BTRFS_WRITE_LOCK_BLOCKING 3
-#define BTRFS_READ_LOCK_BLOCKING 4
/*
* We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
@@ -89,42 +87,28 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
-void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
- bool recurse);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
-void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
-int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
- bool recurse);
-
-static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
-{
- return __btrfs_read_lock_root_node(root, false);
-}
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- BUG_ON(!eb->write_locks);
+ lockdep_assert_held(&eb->lock);
}
#else
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
#endif
-void btrfs_set_path_blocking(struct btrfs_path *p);
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
- if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
+ if (rw == BTRFS_WRITE_LOCK)
btrfs_tree_unlock(eb);
- else if (rw == BTRFS_READ_LOCK_BLOCKING)
- btrfs_tree_read_unlock_blocking(eb);
else if (rw == BTRFS_READ_LOCK)
btrfs_tree_read_unlock(eb);
else
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 87bac9ecdf4c..79d366a36223 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -855,51 +855,6 @@ out:
}
/*
- * search the ordered extents for one corresponding to 'offset' and
- * try to find a checksum. This is used because we allow pages to
- * be reclaimed before their checksum is actually put into the btree
- */
-int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
- u64 disk_bytenr, u8 *sum, int len)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_sum *ordered_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
- unsigned long num_sectors;
- unsigned long i;
- u32 sectorsize = btrfs_inode_sectorsize(inode);
- const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
- const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
- int index = 0;
-
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- if (!ordered)
- return 0;
-
- spin_lock_irq(&tree->lock);
- list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
- if (disk_bytenr >= ordered_sum->bytenr &&
- disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
- num_sectors = ordered_sum->len >> blocksize_bits;
- num_sectors = min_t(int, len - index, num_sectors - i);
- memcpy(sum + index, ordered_sum->sums + i * csum_size,
- num_sectors * csum_size);
-
- index += (int)num_sectors * csum_size;
- if (index == len)
- goto out;
- disk_bytenr += num_sectors * sectorsize;
- }
- }
-out:
- spin_unlock_irq(&tree->lock);
- btrfs_put_ordered_extent(ordered);
- return index;
-}
-
-/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c3a2325e64a4..0bfa82b58e23 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -137,9 +137,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
- return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
}
static inline void
@@ -184,8 +183,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
u64 len);
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list);
-int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
- u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7695c4783d33..fe5e0026129d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -177,8 +177,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
__le64 subvol_id;
read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
- pr_info("\t\tsubvol_id %llu\n",
- (unsigned long long)le64_to_cpu(subvol_id));
+ pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id));
item_size -= sizeof(u64);
offset += sizeof(u64);
}
@@ -191,15 +190,8 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
static void print_eb_refs_lock(struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
- btrfs_info(eb->fs_info,
-"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
- atomic_read(&eb->refs), eb->write_locks,
- atomic_read(&eb->read_locks),
- eb->blocking_writers,
- atomic_read(&eb->blocking_readers),
- eb->spinning_writers,
- atomic_read(&eb->spinning_readers),
- eb->lock_owner, current->pid);
+ btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
+ atomic_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
@@ -398,6 +390,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_node_key_to_cpu(c, &first_key, i);
next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
if (IS_ERR(next)) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 87bd37b70738..fe3046007f52 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -894,8 +894,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
key.objectid = 0;
key.offset = 0;
key.type = 0;
@@ -1944,34 +1942,22 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
struct btrfs_key dst_key;
if (src_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
struct extent_buffer *eb;
int parent_slot;
- u64 child_gen;
- u64 child_bytenr;
eb = src_path->nodes[cur_level + 1];
parent_slot = src_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
src_path->nodes[cur_level] = eb;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ src_path->locks[cur_level] = BTRFS_READ_LOCK;
}
src_path->slots[cur_level] = dst_path->slots[cur_level];
@@ -2066,10 +2052,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
/* Read the tree block if needed */
if (dst_path->nodes[cur_level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
u64 child_gen;
- u64 child_bytenr;
/*
* dst_path->nodes[root_level] must be initialized before
@@ -2088,31 +2072,23 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
*/
eb = dst_path->nodes[cur_level + 1];
parent_slot = dst_path->slots[cur_level + 1];
- child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
/* This node is old, no need to trace */
if (child_gen < last_snapshot)
goto out;
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- cur_level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
dst_path->nodes[cur_level] = eb;
dst_path->slots[cur_level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK;
need_cleanup = true;
}
@@ -2256,38 +2232,28 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
- struct btrfs_key first_key;
int parent_slot;
- u64 child_gen;
u64 child_bytenr;
/*
- * We need to get child blockptr/gen from parent before
- * we can read it.
+ * We need to get child blockptr from parent before we
+ * can read it.
*/
eb = path->nodes[level + 1];
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
- child_gen = btrfs_node_ptr_generation(eb, parent_slot);
- btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen,
- level, &first_key);
+ eb = btrfs_read_node_slot(eb, parent_slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- ret = -EIO;
- goto out;
}
path->nodes[level] = eb;
path->slots[level] = 0;
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize,
@@ -4242,7 +4208,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
/* Read out reloc subtree root */
- reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+ reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
block->reloc_generation, block->level,
&block->first_key);
if (IS_ERR(reloc_eb)) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 255490f42b5d..93fbf87bdc8d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1097,7 +1097,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- u64 last_end = (u64)last->bi_iter.bi_sector << 9;
+ u64 last_end = last->bi_iter.bi_sector << 9;
last_end += last->bi_iter.bi_size;
/*
@@ -1163,7 +1163,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
struct bvec_iter iter;
int i = 0;
- start = (u64)bio->bi_iter.bi_sector << 9;
+ start = bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
@@ -1374,7 +1374,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
int i;
for (i = 0; i < rbio->nr_data; i++) {
@@ -2150,7 +2150,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
if (rbio->faila == -1) {
btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
- __func__, (u64)bio->bi_iter.bi_sector << 9,
+ __func__, bio->bi_iter.bi_sector << 9,
(u64)bio->bi_iter.bi_size, bbio->map_type);
if (generic_io)
btrfs_put_bbio(bbio);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index d9a166eb344e..20fd4aa48a8c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -52,6 +52,7 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
+ u64 owner_root;
struct btrfs_key top;
struct list_head extctl;
int refcnt;
@@ -59,6 +60,7 @@ struct reada_extent {
struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
int scheduled;
+ int level;
};
struct reada_zone {
@@ -87,7 +89,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation);
+ struct btrfs_key *top, u64 owner_root,
+ u64 generation, int level);
/* recurses */
/* in case of err, eb might be NULL */
@@ -165,7 +168,9 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info,
if (rec->generation == generation &&
btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key, n_gen);
+ reada_add_block(rc, bytenr, &next_key,
+ btrfs_header_owner(eb), n_gen,
+ btrfs_header_level(eb) - 1);
}
}
@@ -298,7 +303,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 logical,
- struct btrfs_key *top)
+ struct btrfs_key *top,
+ u64 owner_root, int level)
{
int ret;
struct reada_extent *re = NULL;
@@ -331,6 +337,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
re->refcnt = 1;
+ re->owner_root = owner_root;
+ re->level = level;
/*
* map block
@@ -531,6 +539,8 @@ static void reada_zone_release(struct kref *kref)
{
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+ lockdep_assert_held(&zone->device->fs_info->reada_lock);
+
radix_tree_delete(&zone->device->reada_zones,
zone->end >> PAGE_SHIFT);
@@ -546,14 +556,15 @@ static void reada_control_release(struct kref *kref)
}
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, u64 generation)
+ struct btrfs_key *top, u64 owner_root,
+ u64 generation, int level)
{
struct btrfs_fs_info *fs_info = rc->fs_info;
struct reada_extent *re;
struct reada_extctl *rec;
/* takes one ref */
- re = reada_find_extent(fs_info, logical, top);
+ re = reada_find_extent(fs_info, logical, top, owner_root, level);
if (!re)
return -1;
@@ -645,12 +656,13 @@ static int reada_pick_zone(struct btrfs_device *dev)
}
static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
- int mirror_num, struct extent_buffer **eb)
+ u64 owner_root, int level, int mirror_num,
+ struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(fs_info, bytenr);
+ buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
if (IS_ERR(buf))
return 0;
@@ -738,7 +750,8 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
logical = re->logical;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info, logical, re->owner_root,
+ re->level, mirror_num, &eb);
if (ret)
__readahead_hook(fs_info, re, NULL, ret);
else if (eb)
@@ -945,6 +958,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int ret;
+ int level;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -967,9 +981,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
node = btrfs_root_node(root);
start = node->start;
generation = btrfs_header_generation(node);
+ level = btrfs_header_level(node);
free_extent_buffer(node);
- ret = reada_add_block(rc, start, &max_key, generation);
+ ret = reada_add_block(rc, start, &max_key, root->root_key.objectid,
+ generation, level);
if (ret) {
kfree(rc);
return ERR_PTR(ret);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 78693d3dd15b..4b9b6c52a83b 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -551,34 +551,19 @@ static int process_leaf(struct btrfs_root *root,
static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
int level, u64 *bytenr, u64 *num_bytes)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb;
- u64 block_bytenr, gen;
int ret = 0;
while (level >= 0) {
if (level) {
- struct btrfs_key first_key;
-
- block_bytenr = btrfs_node_blockptr(path->nodes[level],
- path->slots[level]);
- gen = btrfs_node_ptr_generation(path->nodes[level],
- path->slots[level]);
- btrfs_node_key_to_cpu(path->nodes[level], &first_key,
- path->slots[level]);
- eb = read_tree_block(fs_info, block_bytenr, gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(path->nodes[level],
+ path->slots[level]);
if (IS_ERR(eb))
return PTR_ERR(eb);
- if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_read(eb);
path->nodes[level-1] = eb;
path->slots[level-1] = 0;
- path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level-1] = BTRFS_READ_LOCK;
} else {
ret = process_leaf(root, path, bytenr, num_bytes);
if (ret)
@@ -799,8 +784,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
if (!be) {
btrfs_err(fs_info,
"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
- action, (unsigned long long)bytenr,
- (unsigned long long)num_bytes);
+ action, bytenr, num_bytes);
dump_ref_action(fs_info, ra);
kfree(ref);
kfree(ra);
@@ -1001,11 +985,10 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
return -ENOMEM;
eb = btrfs_read_lock_root_node(fs_info->extent_root);
- btrfs_set_lock_blocking_read(eb);
level = btrfs_header_level(eb);
path->nodes[level] = eb;
path->slots[level] = 0;
- path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+ path->locks[level] = BTRFS_READ_LOCK;
while (1) {
/*
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 99aa87c08912..ab80896315be 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -31,10 +31,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
endoff = destoff + olen;
if (endoff > inode->i_size) {
i_size_write(inode, endoff);
- btrfs_inode_safe_disk_i_size_write(inode, 0);
+ btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
}
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
@@ -163,6 +163,7 @@ static int clone_copy_inline_extent(struct inode *dst,
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
+ struct btrfs_drop_extents_args drop_args = { 0 };
int ret;
struct btrfs_key key;
@@ -252,7 +253,11 @@ copy_inline_extent:
trans = NULL;
goto out;
}
- ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ drop_args.path = path;
+ drop_args.start = drop_start;
+ drop_args.end = aligned_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
if (ret)
goto out;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
@@ -263,7 +268,7 @@ copy_inline_extent:
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- inode_add_bytes(dst, datal);
+ btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
@@ -347,7 +352,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 drop_start;
/* Note the key will change type as we walk through the tree */
- path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
@@ -417,7 +421,6 @@ process_slot:
size);
btrfs_release_path(path);
- path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(BTRFS_I(inode));
@@ -533,7 +536,6 @@ process_slot:
* mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
- path->leave_spinning = 0;
ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
@@ -652,7 +654,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
if (destoff > inode->i_size) {
const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
- ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+ ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
if (ret)
return ret;
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9ba92d86da0b..19b7db8b2117 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -18,7 +18,6 @@
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
-#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
@@ -783,7 +782,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
btrfs_set_root_refs(root_item, 0);
memset(&root_item->drop_progress, 0,
sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+ btrfs_set_root_drop_level(root_item, 0);
}
btrfs_tree_unlock(eb);
@@ -1196,7 +1195,6 @@ again:
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
eb = btrfs_lock_root_node(dest);
- btrfs_set_lock_blocking_write(eb);
level = btrfs_header_level(eb);
if (level < lowest_level) {
@@ -1210,7 +1208,6 @@ again:
BTRFS_NESTING_COW);
BUG_ON(ret);
}
- btrfs_set_lock_blocking_write(eb);
if (next_key) {
next_key->objectid = (u64)-1;
@@ -1220,8 +1217,6 @@ again:
parent = eb;
while (1) {
- struct btrfs_key first_key;
-
level = btrfs_header_level(parent);
BUG_ON(level < lowest_level);
@@ -1237,7 +1232,6 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
- btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -1262,15 +1256,10 @@ again:
break;
}
- eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
- level - 1, &first_key);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
- } else if (!extent_buffer_uptodate(eb)) {
- ret = -EIO;
- free_extent_buffer(eb);
- break;
}
btrfs_tree_lock(eb);
if (cow) {
@@ -1279,7 +1268,6 @@ again:
BTRFS_NESTING_COW);
BUG_ON(ret);
}
- btrfs_set_lock_blocking_write(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1418,10 +1406,8 @@ static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
int *level)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *eb = NULL;
int i;
- u64 bytenr;
u64 ptr_gen = 0;
u64 last_snapshot;
u32 nritems;
@@ -1429,8 +1415,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
- struct btrfs_key first_key;
-
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
@@ -1450,16 +1434,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
return 0;
}
- bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
- eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
- &first_key);
- if (IS_ERR(eb)) {
+ eb = btrfs_read_node_slot(eb, path->slots[i]);
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- return -EIO;
- }
BUG_ON(btrfs_header_level(eb) != i - 1);
path->nodes[i - 1] = eb;
path->slots[i - 1] = 0;
@@ -1575,7 +1552,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
reloc_root_item = &reloc_root->root_item;
memset(&reloc_root_item->drop_progress, 0,
sizeof(reloc_root_item->drop_progress));
- reloc_root_item->drop_level = 0;
+ btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
btrfs_update_reloc_root(trans, root);
@@ -1652,8 +1629,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
int level;
int max_level;
int replaced = 0;
- int ret;
- int err = 0;
+ int ret = 0;
u32 min_reserved;
path = btrfs_alloc_path();
@@ -1672,7 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
} else {
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
- level = root_item->drop_level;
+ level = btrfs_root_drop_level(root_item);
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
@@ -1704,13 +1680,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
BTRFS_RESERVE_FLUSH_LIMIT);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
@@ -1732,10 +1706,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0)
break;
@@ -1746,11 +1718,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
-
if (ret > 0) {
level = ret;
btrfs_node_key_to_cpu(path->nodes[level], &key,
@@ -1769,7 +1738,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
*/
btrfs_node_key(path->nodes[level], &root_item->drop_progress,
path->slots[level]);
- root_item->drop_level = level;
+ btrfs_set_root_drop_level(root_item, level);
btrfs_end_transaction_throttle(trans);
trans = NULL;
@@ -1789,12 +1758,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
- if (ret < 0)
- err = ret;
out:
btrfs_free_path(path);
- if (err == 0)
+ if (ret == 0)
insert_dirty_subvol(trans, rc, root);
if (trans)
@@ -1805,7 +1772,7 @@ out:
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
- return err;
+ return ret;
}
static noinline_for_stack
@@ -2205,7 +2172,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
{
- struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
@@ -2213,17 +2179,14 @@ static int do_relocation(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
u32 blocksize;
u64 bytenr;
- u64 generation;
int slot;
- int ret;
- int err = 0;
+ int ret = 0;
BUG_ON(lowest && node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
- struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
cond_resched();
@@ -2235,10 +2198,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (upper->eb && !upper->locked) {
if (!lowest) {
ret = btrfs_bin_search(upper->eb, key, &slot);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
bytenr = btrfs_node_blockptr(upper->eb, slot);
if (node->eb->start == bytenr)
@@ -2250,10 +2211,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret) {
- if (ret < 0)
- err = ret;
- else
- err = -ENOENT;
+ if (ret > 0)
+ ret = -ENOENT;
btrfs_release_path(path);
break;
@@ -2273,10 +2232,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
} else {
ret = btrfs_bin_search(upper->eb, key, &slot);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(ret);
}
@@ -2287,7 +2244,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
bytenr, node->bytenr, slot,
upper->eb->start);
- err = -EIO;
+ ret = -EIO;
goto next;
}
} else {
@@ -2296,30 +2253,20 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
blocksize = root->fs_info->nodesize;
- generation = btrfs_node_ptr_generation(upper->eb, slot);
- btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
- eb = read_tree_block(fs_info, bytenr, generation,
- upper->level - 1, &first_key);
+ eb = btrfs_read_node_slot(upper->eb, slot);
if (IS_ERR(eb)) {
- err = PTR_ERR(eb);
- goto next;
- } else if (!extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
- err = -EIO;
+ ret = PTR_ERR(eb);
goto next;
}
btrfs_tree_lock(eb);
- btrfs_set_lock_blocking_write(eb);
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto next;
- }
BUG_ON(node->eb != eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
@@ -2345,19 +2292,19 @@ next:
btrfs_backref_drop_node_buffer(upper);
else
btrfs_backref_unlock_node_buffer(upper);
- if (err)
+ if (ret)
break;
}
- if (!err && node->pending) {
+ if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
node->pending = 0;
}
path->lowest_level = 0;
- BUG_ON(err == -ENOSPC);
- return err;
+ BUG_ON(ret == -ENOSPC);
+ return ret;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
@@ -2446,7 +2393,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
+ eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset,
block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
@@ -2546,7 +2493,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Kick in readahead for tree blocks with missing keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
- readahead_tree_block(fs_info, block->bytenr);
+ btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0,
+ block->level);
}
/* Get first keys */
@@ -3071,7 +3019,7 @@ int add_data_references(struct reloc_control *rc,
while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
+ eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -3694,7 +3642,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
- root->root_item.drop_level = 0;
+ btrfs_set_root_drop_level(&root->root_item, 0);
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e71e7586e9eb..5f4f88a4d2c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -20,6 +20,7 @@
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
+#include "zoned.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -71,11 +72,9 @@ struct scrub_page {
u64 physical;
u64 physical_for_dev_replace;
atomic_t refs;
- struct {
- unsigned int mirror_num:8;
- unsigned int have_csum:1;
- unsigned int io_error:1;
- };
+ u8 mirror_num;
+ int have_csum:1;
+ int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
@@ -131,7 +130,7 @@ struct scrub_parity {
int nsectors;
- u64 stripe_len;
+ u32 stripe_len;
refcount_t refs;
@@ -161,7 +160,6 @@ struct scrub_ctx {
atomic_t workers_pending;
spinlock_t list_lock;
wait_queue_head_t list_wait;
- u16 csum_size;
struct list_head csum_list;
atomic_t cancel_req;
int readonly;
@@ -235,15 +233,15 @@ static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
+ u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
+ u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num);
@@ -256,10 +254,10 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
- return page->recover &&
- (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ return spage->recover &&
+ (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -610,7 +608,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
atomic_set(&sctx->bios_in_flight, 0);
atomic_set(&sctx->workers_pending, 0);
atomic_set(&sctx->cancel_req, 0);
- sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
@@ -1092,11 +1089,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count;
page_num++) {
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_block *sblock_other = NULL;
/* skip no-io-error page in scrub */
- if (!page_bad->io_error && !sctx->is_dev_replace)
+ if (!spage_bad->io_error && !sctx->is_dev_replace)
continue;
if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
@@ -1108,7 +1105,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
- } else if (page_bad->io_error) {
+ } else if (spage_bad->io_error) {
/* try to find no-io-error page in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
@@ -1147,7 +1144,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_other,
page_num, 0);
if (0 == ret)
- page_bad->io_error = 0;
+ spage_bad->io_error = 0;
else
success = 0;
}
@@ -1325,13 +1322,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
- struct scrub_page *page;
+ struct scrub_page *spage;
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
- page = kzalloc(sizeof(*page), GFP_NOFS);
- if (!page) {
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -1339,17 +1336,17 @@ leave_nomem:
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
- scrub_page_get(page);
- sblock->pagev[page_index] = page;
- page->sblock = sblock;
- page->flags = flags;
- page->generation = generation;
- page->logical = logical;
- page->have_csum = have_csum;
+ scrub_page_get(spage);
+ sblock->pagev[page_index] = spage;
+ spage->sblock = sblock;
+ spage->flags = flags;
+ spage->generation = generation;
+ spage->logical = logical;
+ spage->have_csum = have_csum;
if (have_csum)
- memcpy(page->csum,
+ memcpy(spage->csum,
original_sblock->pagev[0]->csum,
- sctx->csum_size);
+ sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1360,23 +1357,23 @@ leave_nomem:
mirror_index,
&stripe_index,
&stripe_offset);
- page->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bbio->stripes[stripe_index].physical +
stripe_offset;
- page->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bbio->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
- page->physical_for_dev_replace =
+ spage->physical_for_dev_replace =
original_sblock->pagev[page_index]->
physical_for_dev_replace;
/* for missing devices, dev->bdev is NULL */
- page->mirror_num = mirror_index + 1;
+ spage->mirror_num = mirror_index + 1;
sblock->page_count++;
- page->page = alloc_page(GFP_NOFS);
- if (!page->page)
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
goto leave_nomem;
scrub_get_recover(recover);
- page->recover = recover;
+ spage->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
@@ -1394,19 +1391,19 @@ static void scrub_bio_wait_endio(struct bio *bio)
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
- struct scrub_page *page)
+ struct scrub_page *spage)
{
DECLARE_COMPLETION_ONSTACK(done);
int ret;
int mirror_num;
- bio->bi_iter.bi_sector = page->logical >> 9;
+ bio->bi_iter.bi_sector = spage->logical >> 9;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
- mirror_num = page->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
- page->recover->map_length,
+ mirror_num = spage->sblock->pagev[0]->mirror_num;
+ ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ spage->recover->map_length,
mirror_num, 0);
if (ret)
return ret;
@@ -1431,10 +1428,10 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
- struct scrub_page *page = sblock->pagev[page_num];
+ struct scrub_page *spage = sblock->pagev[page_num];
- WARN_ON(!page->page);
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ WARN_ON(!spage->page);
+ bio_add_page(bio, spage->page, PAGE_SIZE, 0);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
@@ -1475,24 +1472,24 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
- struct scrub_page *page = sblock->pagev[page_num];
+ struct scrub_page *spage = sblock->pagev[page_num];
- if (page->dev->bdev == NULL) {
- page->io_error = 1;
+ if (spage->dev->bdev == NULL) {
+ spage->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
- WARN_ON(!page->page);
+ WARN_ON(!spage->page);
bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page->dev->bdev);
+ bio_set_dev(bio, spage->dev->bdev);
- bio_add_page(bio, page->page, PAGE_SIZE, 0);
- bio->bi_iter.bi_sector = page->physical >> 9;
+ bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
if (btrfsic_submit_bio_wait(bio)) {
- page->io_error = 1;
+ spage->io_error = 1;
sblock->no_io_error_seen = 0;
}
@@ -1548,36 +1545,36 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write)
{
- struct scrub_page *page_bad = sblock_bad->pagev[page_num];
- struct scrub_page *page_good = sblock_good->pagev[page_num];
+ struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
- BUG_ON(page_bad->page == NULL);
- BUG_ON(page_good->page == NULL);
+ BUG_ON(spage_bad->page == NULL);
+ BUG_ON(spage_good->page == NULL);
if (force_write || sblock_bad->header_error ||
- sblock_bad->checksum_error || page_bad->io_error) {
+ sblock_bad->checksum_error || spage_bad->io_error) {
struct bio *bio;
int ret;
- if (!page_bad->dev->bdev) {
+ if (!spage_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(1);
- bio_set_dev(bio, page_bad->dev->bdev);
- bio->bi_iter.bi_sector = page_bad->physical >> 9;
+ bio_set_dev(bio, spage_bad->dev->bdev);
+ bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
- ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+ ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
if (PAGE_SIZE != ret) {
bio_put(bio);
return -EIO;
}
if (btrfsic_submit_bio_wait(bio)) {
- btrfs_dev_stat_inc_and_print(page_bad->dev,
+ btrfs_dev_stat_inc_and_print(spage_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
@@ -1798,11 +1795,15 @@ static int scrub_checksum_data(struct scrub_block *sblock)
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
- if (memcmp(csum, spage->csum, sctx->csum_size))
- sblock->checksum_error = 1;
+ /*
+ * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
+ * only contains one sector of data.
+ */
+ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+ if (memcmp(csum, spage->csum, fs_info->csum_size))
+ sblock->checksum_error = 1;
return sblock->checksum_error;
}
@@ -1814,16 +1815,26 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
- const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
+ /*
+ * This is done in sectorsize steps even for metadata as there's a
+ * constraint for nodesize to be aligned to sectorsize. This will need
+ * to change so we don't misuse data and metadata units like that.
+ */
+ const u32 sectorsize = sctx->fs_info->sectorsize;
+ const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
int i;
struct scrub_page *spage;
char *kaddr;
BUG_ON(sblock->page_count < 1);
+
+ /* Each member in pagev is just one block, not a full page */
+ ASSERT(sblock->page_count == num_sectors);
+
spage = sblock->pagev[0];
kaddr = page_address(spage->page);
h = (struct btrfs_header *)kaddr;
- memcpy(on_disk_csum, h->csum, sctx->csum_size);
+ memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
/*
* we don't use the getter functions here, as we
@@ -1848,15 +1859,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
- PAGE_SIZE - BTRFS_CSUM_SIZE);
+ sectorsize - BTRFS_CSUM_SIZE);
- for (i = 1; i < num_pages; i++) {
+ for (i = 1; i < num_sectors; i++) {
kaddr = page_address(sblock->pagev[i]->page);
- crypto_shash_update(shash, kaddr, PAGE_SIZE);
+ crypto_shash_update(shash, kaddr, sectorsize);
}
crypto_shash_final(shash, calculated_csum);
- if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
sblock->checksum_error = 1;
return sblock->header_error || sblock->checksum_error;
@@ -1893,7 +1904,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
- if (memcmp(calculated_csum, s->csum, sctx->csum_size))
+ if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
++fail_cor;
if (fail_cor + fail_gen) {
@@ -2150,12 +2161,13 @@ bbio_out:
spin_unlock(&sctx->stat_lock);
}
-static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
- u64 gen, int mirror_num, u8 *csum, int force,
+ u64 gen, int mirror_num, u8 *csum,
u64 physical_for_dev_replace)
{
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
@@ -2174,7 +2186,12 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
+ /*
+ * Here we will allocate one page for one sector to scrub.
+ * This is fine if PAGE_SIZE == sectorsize, but will cost
+ * more memory for PAGE_SIZE > sectorsize case.
+ */
+ u32 l = min(sectorsize, len);
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
@@ -2198,7 +2215,7 @@ leave_nomem:
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
@@ -2231,7 +2248,7 @@ leave_nomem:
}
}
- if (force)
+ if (flags & BTRFS_EXTENT_FLAG_SUPER)
scrub_submit(sctx);
}
@@ -2295,12 +2312,11 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
- u64 start, u64 len)
+ u64 start, u32 len)
{
u64 offset;
- u64 nsectors64;
u32 nsectors;
- int sectorsize = sparity->sctx->fs_info->sectorsize;
+ u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
if (len >= sparity->stripe_len) {
bitmap_set(bitmap, 0, sparity->nsectors);
@@ -2309,11 +2325,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
start -= sparity->logic_start;
start = div64_u64_rem(start, sparity->stripe_len, &offset);
- offset = div_u64(offset, sectorsize);
- nsectors64 = div_u64(len, sectorsize);
-
- ASSERT(nsectors64 < UINT_MAX);
- nsectors = (u32)nsectors64;
+ offset = offset >> sectorsize_bits;
+ nsectors = len >> sectorsize_bits;
if (offset + nsectors <= sparity->nsectors) {
bitmap_set(bitmap, offset, nsectors);
@@ -2325,13 +2338,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
}
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
- u64 start, u64 len)
+ u64 start, u32 len)
{
__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
}
@@ -2359,48 +2372,77 @@ static void scrub_block_complete(struct scrub_block *sblock)
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
PAGE_SIZE;
+ ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
start, end - start);
}
}
+static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
+{
+ sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
+ list_del(&sum->list);
+ kfree(sum);
+}
+
+/*
+ * Find the desired csum for range [logical, logical + sectorsize), and store
+ * the csum into @csum.
+ *
+ * The search source is sctx->csum_list, which is a pre-populated list
+ * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * that is before @logical.
+ *
+ * Return 0 if there is no csum for the range.
+ * Return 1 if there is csum for the range and copied to @csum.
+ */
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
- struct btrfs_ordered_sum *sum = NULL;
- unsigned long index;
- unsigned long num_sectors;
+ bool found = false;
while (!list_empty(&sctx->csum_list)) {
+ struct btrfs_ordered_sum *sum = NULL;
+ unsigned long index;
+ unsigned long num_sectors;
+
sum = list_first_entry(&sctx->csum_list,
struct btrfs_ordered_sum, list);
+ /* The current csum range is beyond our range, no csum found */
if (sum->bytenr > logical)
- return 0;
- if (sum->bytenr + sum->len > logical)
break;
- ++sctx->stat.csum_discards;
- list_del(&sum->list);
- kfree(sum);
- sum = NULL;
- }
- if (!sum)
- return 0;
+ /*
+ * The current sum is before our bytenr, since scrub is always
+ * done in bytenr order, the csum will never be used anymore,
+ * clean it up so that later calls won't bother with the range,
+ * and continue search the next range.
+ */
+ if (sum->bytenr + sum->len <= logical) {
+ drop_csum_range(sctx, sum);
+ continue;
+ }
- index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
- ASSERT(index < UINT_MAX);
+ /* Now the csum range covers our bytenr, copy the csum */
+ found = true;
+ index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
+ num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
- num_sectors = sum->len / sctx->fs_info->sectorsize;
- memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
- if (index == num_sectors - 1) {
- list_del(&sum->list);
- kfree(sum);
+ memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
+ sctx->fs_info->csum_size);
+
+ /* Cleanup the range if we're at the end of the csum range */
+ if (index == num_sectors - 1)
+ drop_csum_range(sctx, sum);
+ break;
}
+ if (!found)
+ return 0;
return 1;
}
/* scrub extent tries to collect up to 64 kB for each bio */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
@@ -2432,7 +2474,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
}
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2442,7 +2484,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
++sctx->stat.no_csum;
}
ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
- mirror_num, have_csum ? csum : NULL, 0,
+ mirror_num, have_csum ? csum : NULL,
physical_for_dev_replace);
if (ret)
return ret;
@@ -2455,14 +2497,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
}
static int scrub_pages_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num, u8 *csum)
{
struct scrub_ctx *sctx = sparity->sctx;
struct scrub_block *sblock;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int index;
+ ASSERT(IS_ALIGNED(len, sectorsize));
+
sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
@@ -2481,7 +2526,6 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
for (index = 0; len > 0; index++) {
struct scrub_page *spage;
- u64 l = min_t(u64, len, PAGE_SIZE);
spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
@@ -2508,7 +2552,7 @@ leave_nomem:
spage->mirror_num = mirror_num;
if (csum) {
spage->have_csum = 1;
- memcpy(spage->csum, csum, sctx->csum_size);
+ memcpy(spage->csum, csum, sctx->fs_info->csum_size);
} else {
spage->have_csum = 0;
}
@@ -2516,9 +2560,12 @@ leave_nomem:
spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
- len -= l;
- logical += l;
- physical += l;
+
+
+ /* Iterate over the stripe range in sectorsize steps */
+ len -= sectorsize;
+ logical += sectorsize;
+ physical += sectorsize;
}
WARN_ON(sblock->page_count == 0);
@@ -2539,7 +2586,7 @@ leave_nomem:
}
static int scrub_extent_for_parity(struct scrub_parity *sparity,
- u64 logical, u64 len,
+ u64 logical, u32 len,
u64 physical, struct btrfs_device *dev,
u64 flags, u64 gen, int mirror_num)
{
@@ -2563,7 +2610,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
while (len) {
- u64 l = min_t(u64, len, blocksize);
+ u32 l = min(len, blocksize);
int have_csum = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -2767,7 +2814,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 generation;
u64 extent_logical;
u64 extent_physical;
- u64 extent_len;
+ /* Check the comment in scrub_stripe() for why u32 is enough here */
+ u32 extent_len;
u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
@@ -2776,7 +2824,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
+ ASSERT(map->stripe_len <= U32_MAX);
+ nsectors = map->stripe_len >> fs_info->sectorsize_bits;
bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
GFP_NOFS);
@@ -2787,6 +2836,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
return -ENOMEM;
}
+ ASSERT(map->stripe_len <= U32_MAX);
sparity->stripe_len = map->stripe_len;
sparity->nsectors = nsectors;
sparity->sctx = sctx;
@@ -2881,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
}
again:
extent_logical = key.objectid;
+ ASSERT(bytes <= U32_MAX);
extent_len = bytes;
if (extent_logical < logic_start) {
@@ -2959,9 +3010,11 @@ next:
logic_start += map->stripe_len;
}
out:
- if (ret < 0)
+ if (ret < 0) {
+ ASSERT(logic_end - logic_start <= U32_MAX);
scrub_parity_mark_sectors_error(sparity, logic_start,
logic_end - logic_start);
+ }
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_lock);
@@ -3003,7 +3056,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 offset;
u64 extent_logical;
u64 extent_physical;
- u64 extent_len;
+ /*
+ * Unlike chunk length, extent length should never go beyond
+ * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
+ */
+ u32 extent_len;
u64 stripe_logical;
u64 stripe_end;
struct btrfs_device *extent_dev;
@@ -3084,17 +3141,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key, &key_end);
- key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.type = BTRFS_EXTENT_CSUM_KEY;
- key.offset = logical;
- key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = logic_end;
- reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+ if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
+ key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = logical;
+ key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+ key_end.type = BTRFS_EXTENT_CSUM_KEY;
+ key_end.offset = logic_end;
+ reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+ } else {
+ reada2 = NULL;
+ }
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
- if (!IS_ERR(reada2))
+ if (!IS_ERR_OR_NULL(reada2))
btrfs_reada_wait(reada2);
@@ -3248,6 +3309,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
again:
extent_logical = key.objectid;
+ ASSERT(bytes <= U32_MAX);
extent_len = bytes;
/*
@@ -3704,10 +3766,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
if (bytenr + BTRFS_SUPER_INFO_SIZE >
scrub_dev->commit_total_bytes)
break;
+ if (!btrfs_check_super_location(scrub_dev, bytenr))
+ continue;
ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
- NULL, 1, bytenr);
+ NULL, bytenr);
if (ret)
return ret;
}
@@ -3821,14 +3885,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EINVAL;
}
- if (fs_info->sectorsize != PAGE_SIZE) {
- /* not supported for data w/o checksums */
- btrfs_err_rl(fs_info,
- "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
- fs_info->sectorsize, PAGE_SIZE);
- return -EINVAL;
- }
-
if (fs_info->nodesize >
PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
@@ -3855,7 +3911,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4032,7 +4088,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
@@ -4043,7 +4099,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
}
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
- u64 extent_logical, u64 extent_len,
+ u64 extent_logical, u32 extent_len,
u64 *extent_physical,
struct btrfs_device **extent_dev,
int *extent_mirror_num)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 340c76a12ce1..d719a2755a40 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2410,7 +2410,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
sctx->send_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
- le64_to_cpu(sctx->send_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->send_root->root_item));
if (parent_root) {
if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
@@ -2419,7 +2419,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(sctx->parent_root->root_item.ctransid));
+ btrfs_root_ctransid(&sctx->parent_root->root_item));
}
ret = send_cmd(sctx);
@@ -5101,7 +5101,7 @@ static int send_clone(struct send_ctx *sctx,
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- le64_to_cpu(clone_root->root->root_item.ctransid));
+ btrfs_root_ctransid(&clone_root->root->root_item));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
clone_root->offset);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c46be27be700..8260f8bb3ff0 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -57,8 +57,9 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
- const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = PAGE_SIZE - oip; \
@@ -85,8 +86,8 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = offset_in_page(member_offset); \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
char *kaddr = page_address(eb->pages[idx]); \
const int size = sizeof(u##bits); \
const int part = PAGE_SIZE - oip; \
@@ -106,8 +107,9 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
- const unsigned long oip = offset_in_page(member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
+ const unsigned long oip = get_eb_offset_in_page(token->eb, \
+ member_offset); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
const int part = PAGE_SIZE - oip; \
@@ -136,8 +138,8 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = offset_in_page(member_offset); \
- const unsigned long idx = member_offset >> PAGE_SHIFT; \
+ const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
+ const unsigned long idx = get_eb_page_index(member_offset); \
char *kaddr = page_address(eb->pages[idx]); \
const int size = sizeof(u##bits); \
const int part = PAGE_SIZE - oip; \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8840a4fa81eb..022f20810089 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -44,6 +44,7 @@
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
+#include "zoned.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
@@ -240,9 +241,13 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.fmt = fmt;
vaf.va = &args;
- if (__ratelimit(ratelimit))
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);
+ if (__ratelimit(ratelimit)) {
+ if (fs_info)
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
+ fs_info->sb->s_id, &vaf);
+ else
+ printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
va_end(args);
}
@@ -333,7 +338,6 @@ enum {
Opt_device,
Opt_fatal_errors,
Opt_flushoncommit, Opt_noflushoncommit,
- Opt_inode_cache, Opt_noinode_cache,
Opt_max_inline,
Opt_barrier, Opt_nobarrier,
Opt_datacow, Opt_nodatacow,
@@ -360,9 +364,13 @@ enum {
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
+ Opt_ignorebadroots,
+ Opt_ignoredatacsums,
+ Opt_rescue_all,
/* Deprecated options */
Opt_recovery,
+ Opt_inode_cache, Opt_noinode_cache,
/* Debugging options */
Opt_check_integrity,
@@ -455,9 +463,25 @@ static const match_table_t tokens = {
static const match_table_t rescue_tokens = {
{Opt_usebackuproot, "usebackuproot"},
{Opt_nologreplay, "nologreplay"},
+ {Opt_ignorebadroots, "ignorebadroots"},
+ {Opt_ignorebadroots, "ibadroots"},
+ {Opt_ignoredatacsums, "ignoredatacsums"},
+ {Opt_ignoredatacsums, "idatacsums"},
+ {Opt_rescue_all, "all"},
{Opt_err, NULL},
};
+static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
+ const char *opt_name)
+{
+ if (fs_info->mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
+
static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
{
char *opts;
@@ -487,6 +511,23 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
+ case Opt_ignorebadroots:
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ break;
+ case Opt_ignoredatacsums:
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ break;
+ case Opt_rescue_all:
+ btrfs_info(info, "enabling all of the rescue options");
+ btrfs_set_and_info(info, IGNOREDATACSUMS,
+ "ignoring data csums");
+ btrfs_set_and_info(info, IGNOREBADROOTS,
+ "ignoring bad roots");
+ btrfs_set_and_info(info, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
case Opt_err:
btrfs_info(info, "unrecognized rescue option '%s'", p);
ret = -EINVAL;
@@ -511,7 +552,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
{
substring_t args[MAX_OPT_ARGS];
char *p, *num;
- u64 cache_gen;
int intarg;
int ret = 0;
char *compress_type;
@@ -521,11 +561,17 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
bool saved_compress_force;
int no_compress = 0;
- cache_gen = btrfs_super_cache_generation(info->super_copy);
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (cache_gen)
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ else if (btrfs_free_space_cache_v1_active(info)) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_info(info,
+ "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(info->super_copy, 0);
+ } else {
+ btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ }
+ }
/*
* Even the options are empty, we still need to do extra check
@@ -832,14 +878,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
break;
case Opt_inode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and will have no effect from 5.11");
- btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
- "enabling inode map caching");
- break;
case Opt_noinode_cache:
- btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
- "disabling inode map caching");
+ btrfs_warn(info,
+ "the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
btrfs_set_and_info(info, CLEAR_CACHE,
@@ -968,14 +1009,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
}
}
check:
- /*
- * Extra check for current option against current flag
- */
- if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
- btrfs_err(info,
- "nologreplay must be used with ro mount option");
+ /* We're read-only, don't have to check. */
+ if (new_flags & SB_RDONLY)
+ goto out;
+
+ if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
ret = -EINVAL;
- }
out:
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
!btrfs_test_opt(info, FREE_SPACE_TREE) &&
@@ -984,6 +1025,8 @@ out:
ret = -EINVAL;
}
+ if (!ret)
+ ret = btrfs_check_mountopts_zoned(info);
if (!ret && btrfs_test_opt(info, SPACE_CACHE))
btrfs_info(info, "disk space caching is enabled");
if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
@@ -1127,7 +1170,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
ret = -ENOMEM;
goto err;
}
- path->leave_spinning = 1;
name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
@@ -1256,7 +1298,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
/*
* Find the "default" dir item which points to the root item that we
@@ -1383,11 +1424,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return btrfs_commit_transaction(trans);
}
+static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
+{
+ seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
+ *printed = true;
+}
+
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
const char *subvol_name;
+ bool printed = false;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1420,7 +1468,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
if (btrfs_test_opt(info, NOLOGREPLAY))
- seq_puts(seq, ",rescue=nologreplay");
+ print_rescue_option(seq, "nologreplay", &printed);
+ if (btrfs_test_opt(info, USEBACKUPROOT))
+ print_rescue_option(seq, "usebackuproot", &printed);
+ if (btrfs_test_opt(info, IGNOREBADROOTS))
+ print_rescue_option(seq, "ignorebadroots", &printed);
+ if (btrfs_test_opt(info, IGNOREDATACSUMS))
+ print_rescue_option(seq, "ignoredatacsums", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1429,9 +1483,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
- if (btrfs_test_opt(info, SPACE_CACHE))
+ if (btrfs_free_space_cache_v1_active(info))
seq_puts(seq, ",space_cache");
- else if (btrfs_test_opt(info, FREE_SPACE_TREE))
+ else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
@@ -1445,8 +1499,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",enospc_debug");
if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
- if (btrfs_test_opt(info, INODE_MAP_CACHE))
- seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -1810,6 +1862,8 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
unsigned long old_opts)
{
+ const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
+
/*
* We need to cleanup all defragable inodes if the autodefragment is
* close or the filesystem is read only.
@@ -1826,12 +1880,15 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_discard_cleanup(fs_info);
+
+ /* If we toggled space cache */
+ if (cache_opt != btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
unsigned old_flags = sb->s_flags;
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
@@ -1862,6 +1919,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from ro to rw");
+ /* Make sure free space cache options match the state on disk */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ }
+
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
@@ -1924,39 +1997,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
}
- ret = btrfs_cleanup_fs_roots(fs_info);
- if (ret)
- goto restore;
-
- /* recover relocation */
- mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(root);
- mutex_unlock(&fs_info->cleaner_mutex);
- if (ret)
- goto restore;
-
- ret = btrfs_resume_balance_async(fs_info);
+ /*
+ * NOTE: when remounting with a change that does writes, don't
+ * put it anywhere above this point, as we are not sure to be
+ * safe to write until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
goto restore;
- ret = btrfs_resume_dev_replace_async(fs_info);
- if (ret) {
- btrfs_warn(fs_info, "failed to resume dev_replace");
- goto restore;
- }
-
- btrfs_qgroup_rescan_resume(fs_info);
-
- if (!fs_info->uuid_root) {
- btrfs_info(fs_info, "creating UUID tree");
- ret = btrfs_create_uuid_tree(fs_info);
- if (ret) {
- btrfs_warn(fs_info,
- "failed to create the UUID tree %d",
- ret);
- goto restore;
- }
- }
sb->s_flags &= ~SB_RDONLY;
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
@@ -1970,6 +2019,7 @@ out:
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
@@ -2156,7 +2206,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 total_used = 0;
u64 total_free_data = 0;
u64 total_free_meta = 0;
- int bits = dentry->d_sb->s_blocksize_bits;
+ u32 bits = fs_info->sectorsize_bits;
__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@@ -2463,6 +2513,11 @@ static void __init btrfs_print_mod_info(void)
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
", ref-verify=on"
#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ", zoned=yes"
+#else
+ ", zoned=no"
+#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
}
@@ -2523,8 +2578,6 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_end_io_wq;
- btrfs_init_lockdep();
-
btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 279d9262b676..19b9fffa2c9c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -263,6 +263,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
+/* Remove once support for zoned allocation is feature complete */
+#ifdef CONFIG_BTRFS_DEBUG
+BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+#endif
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -278,6 +282,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
BTRFS_FEAT_ATTR_PTR(raid1c34),
+#ifdef CONFIG_BTRFS_DEBUG
+ BTRFS_FEAT_ATTR_PTR(zoned),
+#endif
NULL
};
@@ -329,10 +336,35 @@ static ssize_t send_stream_version_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+static const char *rescue_opts[] = {
+ "usebackuproot",
+ "nologreplay",
+ "ignorebadroots",
+ "ignoredatacsums",
+ "all",
+};
+
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (i ? " " : ""), rescue_opts[i]);
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+ supported_rescue_options_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
+ BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
NULL
};
@@ -433,7 +465,8 @@ static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
-
+ btrfs_discard_calc_delay(discard_ctl);
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
@@ -463,7 +496,7 @@ static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
return -EINVAL;
WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
-
+ btrfs_discard_schedule_work(discard_ctl, true);
return len;
}
BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
@@ -854,6 +887,82 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
+/*
+ * Look for an exact string @string in @buffer with possible leading or
+ * trailing whitespace
+ */
+static bool strmatch(const char *buffer, const char *string)
+{
+ const size_t len = strlen(string);
+
+ /* Skip leading whitespace */
+ buffer = skip_spaces(buffer);
+
+ /* Match entire string, check if the rest is whitespace or empty */
+ if (strncmp(string, buffer, len) == 0 &&
+ strlen(skip_spaces(buffer + len)) == 0)
+ return true;
+
+ return false;
+}
+
+static const char * const btrfs_read_policy_name[] = { "pid" };
+
+static ssize_t btrfs_read_policy_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (fs_devices->read_policy == i)
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ else
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (ret == 0 ? "" : " "),
+ btrfs_read_policy_name[i]);
+ }
+
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+
+ return ret;
+}
+
+static ssize_t btrfs_read_policy_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
+ int i;
+
+ for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
+ if (strmatch(buf, btrfs_read_policy_name[i])) {
+ if (i != fs_devices->read_policy) {
+ fs_devices->read_policy = i;
+ btrfs_info(fs_devices->fs_info,
+ "read policy set to '%s'",
+ btrfs_read_policy_name[i]);
+ }
+ return len;
+ }
+ }
+
+ return -EINVAL;
+}
+BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -863,6 +972,8 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
BTRFS_ATTR_PTR(, exclusive_operation),
+ BTRFS_ATTR_PTR(, generation),
+ BTRFS_ATTR_PTR(, read_policy),
NULL,
};
@@ -1207,7 +1318,7 @@ static const char *alloc_name(u64 flags)
default:
WARN_ON(1);
return "invalid-combination";
- };
+ }
}
/*
@@ -1232,8 +1343,6 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
- struct hd_struct *disk;
- struct kobject *disk_kobj;
struct kobject *devices_kobj;
/*
@@ -1243,11 +1352,8 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device)
devices_kobj = device->fs_info->fs_devices->devices_kobj;
ASSERT(devices_kobj);
- if (device->bdev) {
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(devices_kobj, disk_kobj->name);
- }
+ if (device->bdev)
+ sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name);
if (device->devid_kobj.state_initialized) {
kobject_del(&device->devid_kobj);
@@ -1353,11 +1459,7 @@ int btrfs_sysfs_add_device(struct btrfs_device *device)
nofs_flag = memalloc_nofs_save();
if (device->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
-
- disk = device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ struct kobject *disk_kobj = bdev_kobj(device->bdev);
ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
if (ret) {
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 999c14e5d0bd..8ca334d554af 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -134,6 +134,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
+ fs_info->sectorsize_bits = ilog2(sectorsize);
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@@ -224,7 +225,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
- btrfs_init_free_space_ctl(cache);
+ btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
mutex_init(&cache->free_space_lock);
return cache;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index df7ce874a74b..73e96d505f4f 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -379,54 +379,50 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- unsigned long len;
unsigned long *bitmap = NULL;
struct extent_buffer *eb = NULL;
int ret;
test_msg("running extent buffer bitmap tests");
- /*
- * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than
- * BTRFS_MAX_METADATA_BLOCKSIZE.
- */
- len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE)
- ? sectorsize * 4 : sectorsize;
-
- fs_info = btrfs_alloc_dummy_fs_info(len, len);
+ fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
return -ENOMEM;
}
- bitmap = kmalloc(len, GFP_KERNEL);
+ bitmap = kmalloc(nodesize, GFP_KERNEL);
if (!bitmap) {
test_err("couldn't allocate test bitmap");
ret = -ENOMEM;
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
+ eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
if (ret)
goto out;
- /* Do it over again with an extent buffer which isn't page-aligned. */
free_extent_buffer(eb);
- eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len);
+
+ /*
+ * Test again for case where the tree block is sectorsize aligned but
+ * not nodesize aligned.
+ */
+ eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
goto out;
}
- ret = __test_eb_bitmaps(bitmap, eb, len);
+ ret = __test_eb_bitmaps(bitmap, eb, nodesize);
out:
free_extent_buffer(eb);
kfree(bitmap);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index aebdf23f0cdd..8f05c1eb833f 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -399,7 +399,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u64 offset;
u64 max_extent_size;
const struct btrfs_free_space_op test_free_space_ops = {
- .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
.use_bitmap = test_use_bitmap,
};
const struct btrfs_free_space_op *orig_free_space_ops;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ce1ca8e73c2d..f3137285a9e2 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -36,7 +36,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
if (ret) {
test_err("couldn't insert ref %d", ret);
@@ -86,7 +85,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
@@ -135,7 +133,6 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
test_std_err(TEST_ALLOC_ROOT);
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
if (ret) {
@@ -170,7 +167,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
return -ENOMEM;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
if (ret) {
test_err("couldn't find extent ref");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 52ada47aff50..8e0f7a1029c6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -16,7 +16,6 @@
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
-#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
@@ -155,6 +154,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
+ struct btrfs_caching_control *caching_ctl, *next;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
@@ -162,8 +162,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->root_key.objectid))
- btrfs_unpin_free_ino(root);
extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -180,6 +178,47 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
spin_lock(&cur_trans->dropped_roots_lock);
}
spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /*
+ * We have to update the last_byte_to_unpin under the commit_root_sem,
+ * at the same time we swap out the commit roots.
+ *
+ * This is because we must have a real view of the last spot the caching
+ * kthreads were while caching. Consider the following views of the
+ * extent tree for a block group
+ *
+ * commit root
+ * +----+----+----+----+----+----+----+
+ * |\\\\| |\\\\|\\\\| |\\\\|\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * new commit root
+ * +----+----+----+----+----+----+----+
+ * | | | |\\\\| | |\\\\|
+ * +----+----+----+----+----+----+----+
+ * 0 1 2 3 4 5 6 7
+ *
+ * If the cache_ctl->progress was at 3, then we are only allowed to
+ * unpin [0,1) and [2,3], because the caching thread has already
+ * processed those extents. We are not allowed to unpin [5,6), because
+ * the caching thread will re-start it's search from 3, and thus find
+ * the hole from [4,6) to add to the free space cache.
+ */
+ spin_lock(&fs_info->block_group_cache_lock);
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ struct btrfs_block_group *cache = caching_ctl->block_group;
+
+ if (btrfs_block_group_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ btrfs_put_caching_control(caching_ctl);
+ } else {
+ cache->last_byte_to_unpin = caching_ctl->progress;
+ }
+ }
+ spin_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -856,24 +895,24 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info)
wait_current_trans(fs_info);
}
-static int should_end_transaction(struct btrfs_trans_handle *trans)
+static bool should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
if (btrfs_check_space_for_delayed_refs(fs_info))
- return 1;
+ return true;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
cur_trans->delayed_refs.flushing)
- return 1;
+ return true;
return should_end_transaction(trans);
}
@@ -1300,8 +1339,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
- btrfs_save_ino_cache(root, trans);
-
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
@@ -1598,8 +1635,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- btrfs_set_lock_blocking_write(old);
-
ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
/* clean up in any case */
btrfs_tree_unlock(old);
@@ -1681,7 +1716,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry->d_name.len * 2);
parent_inode->i_mtime = parent_inode->i_ctime =
current_time(parent_inode);
- ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+ ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1761,6 +1796,8 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->root_level = root_item->level;
if (btrfs_test_opt(fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
+ else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
+ super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
}
@@ -1956,10 +1993,8 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
}
}
-static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
/*
* We use writeback_inodes_sb here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
@@ -1969,50 +2004,15 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans)
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
*/
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
-
- /*
- * Flush dellaloc for any root that is going to be snapshotted.
- * This is done to avoid a corrupted version of files, in the
- * snapshots, that had both buffered and direct IO writes (even
- * if they were done sequentially) due to an unordered update of
- * the inode's size on disk.
- */
- list_for_each_entry(pending, head, list) {
- int ret;
-
- ret = btrfs_start_delalloc_snapshot(pending->root);
- if (ret)
- return ret;
- }
- }
return 0;
}
-static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans)
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
-
- if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) {
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- } else {
- struct btrfs_pending_snapshot *pending;
- struct list_head *head = &trans->transaction->pending_snapshots;
-
- /*
- * Wait for any dellaloc that we started previously for the roots
- * that are going to be snapshotted. This is to avoid a corrupted
- * version of files in the snapshots that had both buffered and
- * direct IO writes (even if they were done sequentially).
- */
- list_for_each_entry(pending, head, list)
- btrfs_wait_ordered_extents(pending->root,
- U64_MAX, 0, U64_MAX);
- }
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2150,7 +2150,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_dec(cur_trans, trans->type);
- ret = btrfs_start_delalloc_flush(trans);
+ ret = btrfs_start_delalloc_flush(fs_info);
if (ret)
goto cleanup_transaction;
@@ -2166,7 +2166,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- btrfs_wait_delalloc_flush(trans);
+ btrfs_wait_delalloc_flush(fs_info);
/*
* Wait for all ordered extents started by a fast fsync that joined this
@@ -2293,8 +2293,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto unlock_tree_log;
}
- btrfs_prepare_extent_commit(fs_info);
-
cur_trans = fs_info->running_transaction;
btrfs_set_root_node(&fs_info->tree_root->root_item,
@@ -2435,10 +2433,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
- if (root->ino_cache_inode) {
- iput(root->ino_cache_inode);
- root->ino_cache_inode = NULL;
- }
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
@@ -2459,16 +2453,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
if (!prev)
return;
- bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
- bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
- if (prev & bit)
- btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
- prev &= ~bit;
-
bit = 1 << BTRFS_PENDING_COMMIT;
if (prev & bit)
btrfs_debug(fs_info, "pending commit done");
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 858d9153a1cd..31ca81bad822 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -112,7 +112,6 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
-#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
@@ -219,7 +218,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
-int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
+bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ea2bb4cb5890..028e733e42f3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -100,7 +100,8 @@ static void file_extent_err(const struct extent_buffer *eb, int slot,
*/
#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \
({ \
- if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
+ if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)))) \
file_extent_err((leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
@@ -203,7 +204,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
u32 item_size = btrfs_item_size_nr(leaf, slot);
u64 extent_end;
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
file_extent_err(leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
@@ -216,7 +217,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* But if objectids mismatch, it means we have a missing
* INODE_ITEM.
*/
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
@@ -225,14 +226,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Make sure the item contains at least inline header, so the file
* extent type is not some garbage.
*/
- if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) {
file_extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
SZ_4K);
return -EUCLEAN;
}
- if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
+ if (unlikely(btrfs_file_extent_type(leaf, fi) >=
+ BTRFS_NR_FILE_EXTENT_TYPES)) {
file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
@@ -244,14 +246,15 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
+ if (unlikely(btrfs_file_extent_compression(leaf, fi) >=
+ BTRFS_NR_COMPRESS_TYPES)) {
file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
- if (btrfs_file_extent_encryption(leaf, fi)) {
+ if (unlikely(btrfs_file_extent_encryption(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
@@ -259,7 +262,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
- if (key->offset) {
+ if (unlikely(key->offset)) {
file_extent_err(leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
@@ -272,8 +275,8 @@ static int check_extent_data_item(struct extent_buffer *leaf,
return 0;
/* Uncompressed inline extent size must match item size */
- if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
- btrfs_file_extent_ram_bytes(leaf, fi)) {
+ if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi))) {
file_extent_err(leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
@@ -284,22 +287,22 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
/* Regular or preallocated extent has fixed item size */
- if (item_size != sizeof(*fi)) {
+ if (unlikely(item_size != sizeof(*fi))) {
file_extent_err(leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
+ if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)))
return -EUCLEAN;
/* Catch extent end overflow */
- if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
- key->offset, &extent_end)) {
+ if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
+ key->offset, &extent_end))) {
file_extent_err(leaf, slot,
"extent end overflow, have file offset %llu extent num bytes %llu",
key->offset,
@@ -320,7 +323,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
prev_fi = btrfs_item_ptr(leaf, slot - 1,
struct btrfs_file_extent_item);
prev_end = file_extent_end(leaf, prev_key, prev_fi);
- if (prev_end > key->offset) {
+ if (unlikely(prev_end > key->offset)) {
file_extent_err(leaf, slot - 1,
"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
prev_end, key->offset);
@@ -336,21 +339,21 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
- u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
+ const u32 csumsize = fs_info->csum_size;
- if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) {
generic_err(leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
generic_err(leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
- if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) {
generic_err(leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
btrfs_item_size_nr(leaf, slot), csumsize);
@@ -363,7 +366,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
prev_csum_end = (prev_item_size / csumsize) * sectorsize;
prev_csum_end += prev_key->offset;
- if (prev_csum_end > key->offset) {
+ if (unlikely(prev_csum_end > key->offset)) {
generic_err(leaf, slot - 1,
"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
prev_csum_end, key->offset);
@@ -388,15 +391,16 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
/* For XATTR_ITEM, location key should be all 0 */
if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (key->type != 0 || key->objectid != 0 || key->offset != 0)
+ if (unlikely(key->objectid != 0 || key->type != 0 ||
+ key->offset != 0))
return -EUCLEAN;
return 0;
}
- if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
- key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
- key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
- key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID)) {
if (is_inode_item) {
generic_err(leaf, slot,
"invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
@@ -414,7 +418,7 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
return -EUCLEAN;
}
- if (key->offset != 0) {
+ if (unlikely(key->offset != 0)) {
if (is_inode_item)
inode_item_err(leaf, slot,
"invalid key offset: has %llu expect 0",
@@ -438,7 +442,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
/* No such tree id */
- if (key->objectid == 0) {
+ if (unlikely(key->objectid == 0)) {
if (is_root_item)
generic_err(leaf, slot, "invalid root id 0");
else
@@ -448,7 +452,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (!is_fstree(key->objectid) && !is_root_item) {
+ if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -464,7 +468,8 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* So here we only check offset for reloc tree whose key->offset must
* be a valid tree.
*/
- if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ key->offset == 0)) {
generic_err(leaf, slot, "invalid root id 0 for reloc tree");
return -EUCLEAN;
}
@@ -480,8 +485,9 @@ static int check_dir_item(struct extent_buffer *leaf,
u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 cur = 0;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
+
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
struct btrfs_key location_key;
@@ -494,7 +500,7 @@ static int check_dir_item(struct extent_buffer *leaf,
int ret;
/* header itself should not cross item boundary */
- if (cur + sizeof(*di) > item_size) {
+ if (unlikely(cur + sizeof(*di) > item_size)) {
dir_item_err(leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
@@ -505,12 +511,12 @@ static int check_dir_item(struct extent_buffer *leaf,
btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
ret = check_root_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
location_key.type == 0) {
ret = check_inode_key(leaf, &location_key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
} else {
dir_item_err(leaf, slot,
@@ -522,22 +528,22 @@ static int check_dir_item(struct extent_buffer *leaf,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
- if (dir_type >= BTRFS_FT_MAX) {
+ if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
}
- if (key->type == BTRFS_XATTR_ITEM_KEY &&
- dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
- if (dir_type == BTRFS_FT_XATTR &&
- key->type != BTRFS_XATTR_ITEM_KEY) {
+ if (unlikely(dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY)) {
dir_item_err(leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
@@ -550,13 +556,13 @@ static int check_dir_item(struct extent_buffer *leaf,
/* Name/data length check */
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
- if (name_len > max_name_len) {
+ if (unlikely(name_len > max_name_len)) {
dir_item_err(leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) {
dir_item_err(leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
@@ -564,7 +570,7 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (data_len && dir_type != BTRFS_FT_XATTR) {
+ if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) {
dir_item_err(leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
@@ -574,7 +580,7 @@ static int check_dir_item(struct extent_buffer *leaf,
total_size = sizeof(*di) + name_len + data_len;
/* header and name/data should not cross item boundary */
- if (cur + total_size > item_size) {
+ if (unlikely(cur + total_size > item_size)) {
dir_item_err(leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
@@ -592,7 +598,7 @@ static int check_dir_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, namebuf,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
- if (key->offset != name_hash) {
+ if (unlikely(key->offset != name_hash)) {
dir_item_err(leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
@@ -641,13 +647,13 @@ static int check_block_group_item(struct extent_buffer *leaf,
* Here we don't really care about alignment since extent allocator can
* handle it. We care more about the size.
*/
- if (key->offset == 0) {
+ if (unlikely(key->offset == 0)) {
block_group_err(leaf, slot,
"invalid block group size 0");
return -EUCLEAN;
}
- if (item_size != sizeof(bgi)) {
+ if (unlikely(item_size != sizeof(bgi))) {
block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
item_size, sizeof(bgi));
@@ -656,8 +662,8 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+ if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) !=
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -665,7 +671,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (btrfs_stack_block_group_used(&bgi) > key->offset) {
+ if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) {
block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
btrfs_stack_block_group_used(&bgi), key->offset);
@@ -673,7 +679,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
flags = btrfs_stack_block_group_flags(&bgi);
- if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+ if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) {
block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
@@ -682,11 +688,11 @@ static int check_block_group_item(struct extent_buffer *leaf,
}
type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (type != BTRFS_BLOCK_GROUP_DATA &&
- type != BTRFS_BLOCK_GROUP_METADATA &&
- type != BTRFS_BLOCK_GROUP_SYSTEM &&
- type != (BTRFS_BLOCK_GROUP_METADATA |
- BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA))) {
block_group_err(leaf, slot,
"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
@@ -773,49 +779,49 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
ncopies = btrfs_raid_array[raid_index].ncopies;
nparity = btrfs_raid_array[raid_index].nparity;
- if (!num_stripes) {
+ if (unlikely(!num_stripes)) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
- if (num_stripes < ncopies) {
+ if (unlikely(num_stripes < ncopies)) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes < ncopies, have %u < %d",
num_stripes, ncopies);
return -EUCLEAN;
}
- if (nparity && num_stripes == nparity) {
+ if (unlikely(nparity && num_stripes == nparity)) {
chunk_err(leaf, chunk, logical,
"invalid chunk num_stripes == nparity, have %u == %d",
num_stripes, nparity);
return -EUCLEAN;
}
- if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk logical, have %llu should aligned to %u",
logical, fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+ if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) {
chunk_err(leaf, chunk, logical,
"invalid chunk sectorsize, have %u expect %u",
btrfs_chunk_sector_size(leaf, chunk),
fs_info->sectorsize);
return -EUCLEAN;
}
- if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+ if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) {
chunk_err(leaf, chunk, logical,
"invalid chunk length, have %llu", length);
return -EUCLEAN;
}
- if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
+ if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
chunk_err(leaf, chunk, logical,
"invalid chunk stripe length: %llu",
stripe_len);
return -EUCLEAN;
}
- if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
- type) {
+ if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
chunk_err(leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
~(BTRFS_BLOCK_GROUP_TYPE_MASK |
@@ -824,22 +830,23 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
- (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
+ if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) {
chunk_err(leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
chunk_err(leaf, chunk, logical,
"missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
type, BTRFS_BLOCK_GROUP_TYPE_MASK);
return -EUCLEAN;
}
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
- (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)))) {
chunk_err(leaf, chunk, logical,
"system chunk with data or metadata type: 0x%llx",
type);
@@ -851,20 +858,21 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
mixed = true;
if (!mixed) {
- if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
- (type & BTRFS_BLOCK_GROUP_DATA)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
+ (type & BTRFS_BLOCK_GROUP_DATA))) {
chunk_err(leaf, chunk, logical,
"mixed chunk type in non-mixed mode: 0x%llx", type);
return -EUCLEAN;
}
}
- if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
- (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) {
+ if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
+ (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
+ (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
+ ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
+ num_stripes != 1))) {
chunk_err(leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -887,7 +895,7 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
{
int num_stripes;
- if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size_nr(leaf, slot),
@@ -901,8 +909,8 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
if (num_stripes == 0)
goto out;
- if (btrfs_chunk_item_size(num_stripes) !=
- btrfs_item_size_nr(leaf, slot)) {
+ if (unlikely(btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size_nr(leaf, slot))) {
chunk_err(leaf, chunk, key->offset,
"invalid chunk item size: have %u expect %lu",
btrfs_item_size_nr(leaf, slot),
@@ -941,14 +949,14 @@ static int check_dev_item(struct extent_buffer *leaf,
{
struct btrfs_dev_item *ditem;
- if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
+ if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
dev_item_err(leaf, slot,
"invalid objectid: has=%llu expect=%llu",
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
- if (btrfs_device_id(leaf, ditem) != key->offset) {
+ if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
dev_item_err(leaf, slot,
"devid mismatch: key has=%llu item has=%llu",
key->offset, btrfs_device_id(leaf, ditem));
@@ -960,8 +968,8 @@ static int check_dev_item(struct extent_buffer *leaf,
* it can be 0 for device removal. Device size check can only be done
* by dev extents check.
*/
- if (btrfs_device_bytes_used(leaf, ditem) >
- btrfs_device_total_bytes(leaf, ditem)) {
+ if (unlikely(btrfs_device_bytes_used(leaf, ditem) >
+ btrfs_device_total_bytes(leaf, ditem))) {
dev_item_err(leaf, slot,
"invalid bytes used: have %llu expect [0, %llu]",
btrfs_device_bytes_used(leaf, ditem),
@@ -986,13 +994,13 @@ static int check_inode_item(struct extent_buffer *leaf,
int ret;
ret = check_inode_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
- if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
@@ -1000,7 +1008,7 @@ static int check_inode_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
- if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
+ if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) {
inode_item_err(leaf, slot,
"invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
@@ -1013,7 +1021,7 @@ static int check_inode_item(struct extent_buffer *leaf,
* anything in the fs. So here we skip the check.
*/
mode = btrfs_inode_mode(leaf, iitem);
- if (mode & ~valid_mask) {
+ if (unlikely(mode & ~valid_mask)) {
inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
@@ -1026,20 +1034,20 @@ static int check_inode_item(struct extent_buffer *leaf,
* FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
*/
if (!has_single_bit_set(mode & S_IFMT)) {
- if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
+ if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) {
inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
- if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
+ if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) {
inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
- if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
+ if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
inode_item_err(leaf, slot,
"unknown flags detected: 0x%llx",
btrfs_inode_flags(leaf, iitem) &
@@ -1059,11 +1067,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int ret;
ret = check_root_key(leaf, key, slot);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
- btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) !=
+ btrfs_legacy_root_item_size())) {
generic_err(leaf, slot,
"invalid root item size, have %u expect %zu or %u",
btrfs_item_size_nr(leaf, slot), sizeof(ri),
@@ -1080,24 +1089,24 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_size_nr(leaf, slot));
/* Generation related */
- if (btrfs_root_generation(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root generation, have %llu expect (0, %llu]",
btrfs_root_generation(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_generation_v2(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_generation_v2(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root v2 generation, have %llu expect (0, %llu]",
btrfs_root_generation_v2(&ri),
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (btrfs_root_last_snapshot(&ri) >
- btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(btrfs_root_last_snapshot(&ri) >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
generic_err(leaf, slot,
"invalid root last_snapshot, have %llu expect (0, %llu]",
btrfs_root_last_snapshot(&ri),
@@ -1106,27 +1115,27 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* Alignment and level check */
- if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid root bytenr, have %llu expect to be aligned to %u",
btrfs_root_bytenr(&ri), fs_info->sectorsize);
return -EUCLEAN;
}
- if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (ri.drop_level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) {
generic_err(leaf, slot,
"invalid root level, have %u expect [0, %u]",
- ri.drop_level, BTRFS_MAX_LEVEL - 1);
+ btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
/* Flags check */
- if (btrfs_root_flags(&ri) & ~valid_root_flags) {
+ if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) {
generic_err(leaf, slot,
"invalid root flags, have 0x%llx expect mask 0x%llx",
btrfs_root_flags(&ri), valid_root_flags);
@@ -1180,14 +1189,14 @@ static int check_extent_item(struct extent_buffer *leaf,
u64 total_refs; /* Total refs in btrfs_extent_item */
u64 inline_refs = 0; /* found total inline refs */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) {
generic_err(leaf, slot,
"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
return -EUCLEAN;
}
/* key->objectid is the bytenr for both key types */
- if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid, have %llu expect to be aligned to %u",
key->objectid, fs_info->sectorsize);
@@ -1195,8 +1204,8 @@ static int check_extent_item(struct extent_buffer *leaf,
}
/* key->offset is tree level for METADATA_ITEM_KEY */
- if (key->type == BTRFS_METADATA_ITEM_KEY &&
- key->offset >= BTRFS_MAX_LEVEL) {
+ if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
+ key->offset >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree level, have %llu expect [0, %u]",
key->offset, BTRFS_MAX_LEVEL - 1);
@@ -1222,7 +1231,7 @@ static int check_extent_item(struct extent_buffer *leaf,
* Either using btrfs_extent_inline_ref::offset, or specific
* data structure.
*/
- if (item_size < sizeof(*ei)) {
+ if (unlikely(item_size < sizeof(*ei))) {
extent_err(leaf, slot,
"invalid item size, have %u expect [%zu, %u)",
item_size, sizeof(*ei),
@@ -1236,15 +1245,16 @@ static int check_extent_item(struct extent_buffer *leaf,
flags = btrfs_extent_flags(leaf, ei);
total_refs = btrfs_extent_refs(leaf, ei);
generation = btrfs_extent_generation(leaf, ei);
- if (generation > btrfs_super_generation(fs_info->super_copy) + 1) {
+ if (unlikely(generation >
+ btrfs_super_generation(fs_info->super_copy) + 1)) {
extent_err(leaf, slot,
"invalid generation, have %llu expect (0, %llu]",
generation,
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
- BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK)))) {
extent_err(leaf, slot,
"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
flags, BTRFS_EXTENT_FLAG_DATA |
@@ -1253,21 +1263,21 @@ static int check_extent_item(struct extent_buffer *leaf,
}
is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
if (is_tree_block) {
- if (key->type == BTRFS_EXTENT_ITEM_KEY &&
- key->offset != fs_info->nodesize) {
+ if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY &&
+ key->offset != fs_info->nodesize)) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect %u",
key->offset, fs_info->nodesize);
return -EUCLEAN;
}
} else {
- if (key->type != BTRFS_EXTENT_ITEM_KEY) {
+ if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) {
extent_err(leaf, slot,
"invalid key type, have %u expect %u for data backref",
key->type, BTRFS_EXTENT_ITEM_KEY);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent length, have %llu expect aligned to %u",
key->offset, fs_info->sectorsize);
@@ -1281,7 +1291,7 @@ static int check_extent_item(struct extent_buffer *leaf,
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)ptr;
- if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) {
+ if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) {
extent_err(leaf, slot,
"invalid tree block info level, have %u expect [0, %u]",
btrfs_tree_block_level(leaf, info),
@@ -1300,7 +1310,7 @@ static int check_extent_item(struct extent_buffer *leaf,
u64 inline_offset;
u8 inline_type;
- if (ptr + sizeof(*iref) > end) {
+ if (unlikely(ptr + sizeof(*iref) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
ptr, sizeof(*iref), end);
@@ -1309,7 +1319,7 @@ static int check_extent_item(struct extent_buffer *leaf,
iref = (struct btrfs_extent_inline_ref *)ptr;
inline_type = btrfs_extent_inline_ref_type(leaf, iref);
inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
- if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
+ if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
ptr, inline_type, end);
@@ -1323,7 +1333,8 @@ static int check_extent_item(struct extent_buffer *leaf,
break;
/* Contains parent bytenr */
case BTRFS_SHARED_BLOCK_REF_KEY:
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1338,7 +1349,8 @@ static int check_extent_item(struct extent_buffer *leaf,
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(dref_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data ref offset, have %llu expect aligned to %u",
dref_offset, fs_info->sectorsize);
@@ -1349,7 +1361,8 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Contains parent bytenr and ref count */
case BTRFS_SHARED_DATA_REF_KEY:
sref = (struct btrfs_shared_data_ref *)(iref + 1);
- if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(inline_offset,
+ fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid data parent bytenr, have %llu expect aligned to %u",
inline_offset, fs_info->sectorsize);
@@ -1365,14 +1378,14 @@ static int check_extent_item(struct extent_buffer *leaf,
ptr += btrfs_extent_inline_ref_size(inline_type);
}
/* No padding is allowed */
- if (ptr != end) {
+ if (unlikely(ptr != end)) {
extent_err(leaf, slot,
"invalid extent item size, padding bytes found");
return -EUCLEAN;
}
/* Finally, check the inline refs against total refs */
- if (inline_refs > total_refs) {
+ if (unlikely(inline_refs > total_refs)) {
extent_err(leaf, slot,
"invalid extent refs, have %llu expect >= inline %llu",
total_refs, inline_refs);
@@ -1389,21 +1402,21 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
if (key->type == BTRFS_SHARED_DATA_REF_KEY)
expect_item_size = sizeof(struct btrfs_shared_data_ref);
- if (btrfs_item_size_nr(leaf, slot) != expect_item_size) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) {
generic_err(leaf, slot,
"invalid item size, have %u expect %u for key type %u",
btrfs_item_size_nr(leaf, slot),
expect_item_size, key->type);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
return -EUCLEAN;
}
- if (key->type != BTRFS_TREE_BLOCK_REF_KEY &&
- !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY &&
+ !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid tree parent bytenr, have %llu expect aligned to %u",
key->offset, leaf->fs_info->sectorsize);
@@ -1419,14 +1432,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot);
- if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) {
generic_err(leaf, slot,
"invalid item size, have %u expect aligned to %zu for key type %u",
btrfs_item_size_nr(leaf, slot),
sizeof(*dref), key->type);
return -EUCLEAN;
}
- if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
key->objectid, leaf->fs_info->sectorsize);
@@ -1443,13 +1456,13 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
owner = btrfs_extent_data_ref_objectid(leaf, dref);
offset = btrfs_extent_data_ref_offset(leaf, dref);
hash = hash_extent_data_ref(root_objectid, owner, offset);
- if (hash != key->offset) {
+ if (unlikely(hash != key->offset)) {
extent_err(leaf, slot,
"invalid extent data ref hash, item has 0x%016llx key has 0x%016llx",
hash, key->offset);
return -EUCLEAN;
}
- if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
extent_err(leaf, slot,
"invalid extent data backref offset, have %llu expect aligned to %u",
offset, leaf->fs_info->sectorsize);
@@ -1469,10 +1482,10 @@ static int check_inode_ref(struct extent_buffer *leaf,
unsigned long ptr;
unsigned long end;
- if (!check_prev_ino(leaf, key, slot, prev_key))
+ if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
- if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) {
inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
btrfs_item_size_nr(leaf, slot),
@@ -1485,7 +1498,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
while (ptr < end) {
u16 namelen;
- if (ptr + sizeof(iref) > end) {
+ if (unlikely(ptr + sizeof(iref) > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
ptr, end, sizeof(iref));
@@ -1494,7 +1507,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
iref = (struct btrfs_inode_ref *)ptr;
namelen = btrfs_inode_ref_name_len(leaf, iref);
- if (ptr + sizeof(*iref) + namelen > end) {
+ if (unlikely(ptr + sizeof(*iref) + namelen > end)) {
inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu namelen %u",
ptr, end, namelen);
@@ -1577,7 +1590,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (btrfs_header_level(leaf) != 0) {
+ if (unlikely(btrfs_header_level(leaf) != 0)) {
generic_err(leaf, 0,
"invalid level for leaf, have %d expect 0",
btrfs_header_level(leaf));
@@ -1596,19 +1609,19 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
u64 owner = btrfs_header_owner(leaf);
/* These trees must never be empty */
- if (owner == BTRFS_ROOT_TREE_OBJECTID ||
- owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
- owner == BTRFS_DEV_TREE_OBJECTID ||
- owner == BTRFS_FS_TREE_OBJECTID ||
- owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_EXTENT_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
generic_err(leaf, 0,
"invalid root, root %llu must never be empty",
owner);
return -EUCLEAN;
}
/* Unknown tree */
- if (owner == 0) {
+ if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
@@ -1616,7 +1629,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return 0;
}
- if (nritems == 0)
+ if (unlikely(nritems == 0))
return 0;
/*
@@ -1637,7 +1650,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
btrfs_item_key_to_cpu(leaf, &key, slot);
/* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
generic_err(leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
@@ -1656,7 +1669,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
else
item_end_expected = btrfs_item_offset_nr(leaf,
slot - 1);
- if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) {
generic_err(leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_end_nr(leaf, slot),
@@ -1669,8 +1682,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ if (unlikely(btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_end_nr(leaf, slot),
@@ -1679,8 +1692,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
}
/* Also check if the item pointer overlaps with btrfs item. */
- if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
- btrfs_item_ptr_offset(leaf, slot)) {
+ if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
+ btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
@@ -1695,7 +1708,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* criteria
*/
ret = check_leaf_item(leaf, &key, slot, &prev_key);
- if (ret < 0)
+ if (unlikely(ret < 0))
return ret;
}
@@ -1728,13 +1741,13 @@ int btrfs_check_node(struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
generic_err(node, 0,
"invalid level for node, have %d expect [1, %d]",
level, BTRFS_MAX_LEVEL - 1);
return -EUCLEAN;
}
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
btrfs_header_owner(node), node->start,
@@ -1748,13 +1761,13 @@ int btrfs_check_node(struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &key, slot);
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
- if (!bytenr) {
+ if (unlikely(!bytenr)) {
generic_err(node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
- if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
generic_err(node, slot,
"unaligned pointer, have %llu should be aligned to %u",
bytenr, fs_info->sectorsize);
@@ -1762,7 +1775,7 @@ int btrfs_check_node(struct extent_buffer *node)
goto out;
}
- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
generic_err(node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index d3f28b8f4ff9..7c45d960b53c 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,7 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
u32 nritems;
root_node = btrfs_lock_root_node(root);
- btrfs_set_lock_blocking_write(root_node);
nritems = btrfs_header_nritems(root_node);
root->defrag_max.objectid = 0;
/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 56cbc1706b6f..254c2ee43aae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,7 +17,6 @@
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
-#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"
@@ -139,8 +138,25 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_root *tree_root = fs_info->tree_root;
int ret = 0;
+ /*
+ * First check if the log root tree was already created. If not, create
+ * it before locking the root's log_mutex, just to keep lockdep happy.
+ */
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
+ mutex_lock(&tree_root->log_mutex);
+ if (!fs_info->log_root_tree) {
+ ret = btrfs_init_log_root_tree(trans, fs_info);
+ if (!ret)
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ }
+ mutex_unlock(&tree_root->log_mutex);
+ if (ret)
+ return ret;
+ }
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
@@ -156,13 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
- mutex_lock(&fs_info->tree_log_mutex);
- if (!fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, fs_info);
- mutex_unlock(&fs_info->tree_log_mutex);
- if (ret)
- goto out;
-
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
@@ -172,7 +181,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
root->log_start_pid = current->pid;
}
- atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
@@ -576,6 +584,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = root->fs_info;
int found_type;
u64 extent_end;
@@ -653,7 +662,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* drop any overlapping extents */
- ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+ drop_args.start = start;
+ drop_args.end = extent_end;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
if (ret)
goto out;
@@ -828,9 +840,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inode_add_bytes(inode, nbytes);
update_inode:
- ret = btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
if (inode)
iput(inode);
@@ -1529,7 +1541,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1564,18 +1576,6 @@ out:
return ret;
}
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 ino)
-{
- int ret;
-
- ret = btrfs_insert_orphan_item(trans, root, ino);
- if (ret == -EEXIST)
- ret = 0;
-
- return ret;
-}
-
static int count_inode_extrefs(struct btrfs_root *root,
struct btrfs_inode *inode, struct btrfs_path *path)
{
@@ -1716,7 +1716,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
+ btrfs_update_inode(trans, root, BTRFS_I(inode));
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1727,7 +1727,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- ret = insert_orphan_item(trans, root, ino);
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
}
out:
@@ -1820,7 +1822,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
else
inc_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
} else {
@@ -1973,7 +1975,7 @@ out:
btrfs_release_path(path);
if (!ret && update_size) {
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
- ret = btrfs_update_inode(trans, root, dir);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
}
kfree(name);
iput(dir);
@@ -2586,6 +2588,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
* those prealloc extents just after replaying them.
*/
if (S_ISREG(mode)) {
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct inode *inode;
u64 from;
@@ -2596,12 +2599,18 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
}
from = ALIGN(i_size_read(inode),
root->fs_info->sectorsize);
- ret = btrfs_drop_extents(wc->trans, root, inode,
- from, (u64)-1, 1);
+ drop_args.start = from;
+ drop_args.end = (u64)-1;
+ drop_args.drop_cache = true;
+ ret = btrfs_drop_extents(wc->trans, root,
+ BTRFS_I(inode),
+ &drop_args);
if (!ret) {
+ inode_sub_bytes(inode,
+ drop_args.bytes_found);
/* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
- root, inode);
+ root, BTRFS_I(inode));
}
iput(inode);
if (ret)
@@ -2709,7 +2718,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
- next = btrfs_find_create_tree_block(fs_info, bytenr);
+ next = btrfs_find_create_tree_block(fs_info, bytenr,
+ btrfs_header_owner(cur),
+ *level - 1);
if (IS_ERR(next))
return PTR_ERR(next);
@@ -2732,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2801,7 +2811,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -2883,7 +2892,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
- btrfs_set_lock_blocking_write(next);
btrfs_clean_tree_block(next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
@@ -3023,6 +3031,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
+ u64 log_root_start;
+ u64 log_root_level;
mutex_lock(&root->log_mutex);
log_transid = ctx->log_transid;
@@ -3200,22 +3210,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
}
- btrfs_set_super_log_root(fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
-
+ log_root_start = log_root_tree->node->start;
+ log_root_level = btrfs_header_level(log_root_tree->node);
log_root_tree->log_transid++;
mutex_unlock(&log_root_tree->log_mutex);
/*
- * Nobody else is going to jump in and write the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
+ * Here we are guaranteed that nobody is going to write the superblock
+ * for the current transaction before us and that neither we do write
+ * our superblock before the previous transaction finishes its commit
+ * and writes its superblock, because:
+ *
+ * 1) We are holding a handle on the current transaction, so no body
+ * can commit it until we release the handle;
+ *
+ * 2) Before writing our superblock we acquire the tree_log_mutex, so
+ * if the previous transaction is still committing, and hasn't yet
+ * written its superblock, we wait for it to do it, because a
+ * transaction commit acquires the tree_log_mutex when the commit
+ * begins and releases it only after writing its superblock.
*/
+ mutex_lock(&fs_info->tree_log_mutex);
+ btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
+ btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
+ mutex_unlock(&fs_info->tree_log_mutex);
if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
@@ -3300,6 +3319,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
if (fs_info->log_root_tree) {
free_log_tree(trans, fs_info->log_root_tree);
fs_info->log_root_tree = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
}
return 0;
}
@@ -4196,6 +4216,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
@@ -4204,19 +4225,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int extent_inserted = 0;
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0, 1,
- sizeof(*fi), &extent_inserted);
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
if (ret)
return ret;
- if (!extent_inserted) {
+ if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = em->start;
@@ -4375,8 +4398,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
do {
ret = btrfs_truncate_inode_items(trans,
root->log_root,
- &inode->vfs_inode,
- truncate_offset,
+ inode, truncate_offset,
BTRFS_EXTENT_DATA_KEY);
} while (ret == -EAGAIN);
if (ret)
@@ -4415,14 +4437,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
- u64 test_gen;
int ret = 0;
int num = 0;
INIT_LIST_HEAD(&extents);
write_lock(&tree->lock);
- test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
list_del_init(&em->list);
@@ -4438,7 +4458,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
goto process;
}
- if (em->generation <= test_gen)
+ if (em->generation < trans->transid)
continue;
/* We log prealloc extents beyond eof later. */
@@ -4571,6 +4591,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
const u64 ino = btrfs_ino(inode);
int ins_nr = 0;
int start_slot = 0;
+ bool found_xattrs = false;
+
+ if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
+ return 0;
key.objectid = ino;
key.type = BTRFS_XATTR_ITEM_KEY;
@@ -4609,6 +4633,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
start_slot = slot;
ins_nr++;
path->slots[0]++;
+ found_xattrs = true;
cond_resched();
}
if (ins_nr > 0) {
@@ -4618,6 +4643,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
return ret;
}
+ if (!found_xattrs)
+ set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
+
return 0;
}
@@ -5303,7 +5331,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags);
while(1) {
ret = btrfs_truncate_inode_items(trans,
- log, &inode->vfs_inode, 0, 0);
+ log, inode, 0, 0);
if (ret != -EAGAIN)
break;
}
@@ -5442,11 +5470,10 @@ out_unlock:
static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
bool ret = false;
mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans > fs_info->last_trans_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
/*
* Make sure any commits to the log are forced to be full
* commits.
@@ -5468,8 +5495,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
+ struct super_block *sb)
{
int ret = 0;
struct dentry *old_parent = NULL;
@@ -5481,8 +5507,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed)
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid)
goto out;
if (!S_ISDIR(inode->vfs_inode.i_mode)) {
@@ -5828,7 +5854,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
while (true) {
struct btrfs_fs_info *fs_info = root->fs_info;
- const u64 last_committed = fs_info->last_trans_committed;
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
struct btrfs_key search_key;
@@ -5847,7 +5872,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation > last_committed)
+ if (BTRFS_I(inode)->generation >= trans->transid)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
@@ -5888,7 +5913,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct dentry *old_parent = NULL;
struct super_block *sb = inode->vfs_inode.i_sb;
int ret = 0;
@@ -5902,7 +5926,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (root != inode->root)
break;
- if (inode->generation > fs_info->last_trans_committed) {
+ if (inode->generation >= trans->transid) {
ret = btrfs_log_inode(trans, root, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
@@ -6019,7 +6043,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
int ret = 0;
- u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
sb = inode->vfs_inode.i_sb;
@@ -6029,23 +6052,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- /*
- * The prev transaction commit doesn't complete, we need do
- * full commit by ourselves.
- */
- if (fs_info->last_trans_log_full_commit >
- fs_info->last_trans_committed) {
- ret = 1;
- goto end_no_trans;
- }
-
if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
- last_committed);
+ ret = check_parent_dirs_for_sync(trans, inode, parent, sb);
if (ret)
goto end_no_trans;
@@ -6075,8 +6087,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed) {
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
@@ -6125,7 +6137,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
- if (inode->last_unlink_trans > last_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;
@@ -6434,7 +6446,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_log_ctx ctx;
/*
@@ -6448,8 +6459,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
+ if (inode->logged_trans < trans->transid &&
+ (!old_dir || old_dir->logged_trans < trans->transid))
return;
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 28525ad7ff8c..74023c8a783f 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -129,8 +129,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
} else {
btrfs_warn(fs_info,
"insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!",
- ret, (unsigned long long)key.objectid,
- (unsigned long long)key.offset, type);
+ ret, key.objectid, key.offset, type);
goto out;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 78637665166e..ee086fc56c30 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,7 @@
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
+#include "zoned.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device)
rcu_string_free(device->name);
extent_io_tree_release(&device->alloc_state);
bio_put(device->flush_bio);
+ btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -667,6 +669,10 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret != 0)
+ goto error_free_page;
+
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -822,7 +828,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
} else {
mutex_lock(&fs_devices->device_list_mutex);
device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL, false);
+ disk_super->dev_item.uuid, NULL);
/*
* If this disk has been pulled into an fs devices created by
@@ -929,16 +935,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* make sure it's the same device if the device is mounted
*/
if (device->bdev) {
- struct block_device *path_bdev;
+ int error;
+ dev_t path_dev;
- path_bdev = lookup_bdev(path);
- if (IS_ERR(path_bdev)) {
+ error = lookup_bdev(path, &path_dev);
+ if (error) {
mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_CAST(path_bdev);
+ return ERR_PTR(error);
}
- if (device->bdev != path_bdev) {
- bdput(path_bdev);
+ if (device->bdev->bd_dev != path_dev) {
mutex_unlock(&fs_devices->device_list_mutex);
/*
* device->fs_info may not be reliable here, so
@@ -953,7 +959,6 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
@@ -1044,7 +1049,7 @@ error:
}
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
- int step, struct btrfs_device **latest_dev)
+ struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
@@ -1089,16 +1094,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
* After we have read the system tree and know devids belonging to this
* filesystem, remove the device which does not belong there.
*/
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *latest_dev = NULL;
struct btrfs_fs_devices *seed_dev;
mutex_lock(&uuid_mutex);
- __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+ __btrfs_free_extra_devids(fs_devices, &latest_dev);
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
- __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
+ __btrfs_free_extra_devids(seed_dev, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1137,6 +1142,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->bdev = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ btrfs_destroy_dev_zone_info(device);
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
@@ -1217,6 +1223,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
+ fs_devices->read_policy = BTRFS_READ_POLICY_PID;
return 0;
}
@@ -1268,7 +1275,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
}
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr)
+ u64 bytenr, u64 bytenr_orig)
{
struct btrfs_super_block *disk_super;
struct page *page;
@@ -1299,7 +1306,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
/* align our pointer to the offset of the super block */
disk_super = p + offset_in_page(bytenr);
- if (btrfs_super_bytenr(disk_super) != bytenr ||
+ if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
btrfs_release_disk_super(p);
return ERR_PTR(-EINVAL);
@@ -1334,7 +1341,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct block_device *bdev;
- u64 bytenr;
+ u64 bytenr, bytenr_orig;
+ int ret;
lockdep_assert_held(&uuid_mutex);
@@ -1344,14 +1352,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
- bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- disk_super = btrfs_read_disk_super(bdev, bytenr);
+ bytenr_orig = btrfs_sb_offset(0);
+ ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -2015,6 +2027,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
if (IS_ERR(disk_super))
continue;
+ if (bdev_is_zoned(bdev)) {
+ btrfs_reset_sb_log_zones(bdev, copy_num);
+ continue;
+ }
+
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
page = virt_to_page(disk_super);
@@ -2293,10 +2310,10 @@ static struct btrfs_device *btrfs_find_device_by_path(
dev_uuid = disk_super->dev_item.uuid;
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid, true);
+ disk_super->metadata_uuid);
else
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid, true);
+ disk_super->fsid);
btrfs_release_disk_super(disk_super);
if (!device)
@@ -2316,7 +2333,7 @@ struct btrfs_device *btrfs_find_device_by_devspec(
if (devid) {
device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL, true);
+ NULL);
if (!device)
return ERR_PTR(-ENOENT);
return device;
@@ -2465,7 +2482,7 @@ next_slot:
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
@@ -2507,6 +2524,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (IS_ERR(bdev))
return PTR_ERR(bdev);
+ if (!btrfs_check_device_zone_type(fs_info, bdev)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
if (fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
@@ -2540,10 +2562,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
}
rcu_assign_pointer(device->name, name);
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+
+ ret = btrfs_get_dev_zone_info(device);
+ if (ret)
+ goto error_free_device;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto error_free_device;
+ goto error_free_zone;
}
q = bdev_get_queue(bdev);
@@ -2556,8 +2585,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
fs_info->sectorsize);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
- device->fs_info = fs_info;
- device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
@@ -2704,6 +2731,8 @@ error_trans:
sb->s_flags |= SB_RDONLY;
if (trans)
btrfs_end_transaction(trans);
+error_free_zone:
+ btrfs_destroy_dev_zone_info(device);
error_free_device:
btrfs_free_device(device);
error:
@@ -5479,7 +5508,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ switch (fs_info->fs_devices->read_policy) {
+ default:
+ /* Shouldn't happen, just warn and use pid instead of failing */
+ btrfs_warn_rl(fs_info,
+ "unknown read_policy type %u, reset to pid",
+ fs_info->fs_devices->read_policy);
+ fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
+ fallthrough;
+ case BTRFS_READ_POLICY_PID:
+ preferred_mirror = first + (current->pid % num_stripes);
+ break;
+ }
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
@@ -6335,7 +6375,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
bio->bi_iter.bi_sector = physical >> 9;
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
- bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
+ bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
@@ -6367,7 +6407,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_iter.bi_sector << 9;
+ u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -6447,8 +6487,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
* If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid,
- bool seed)
+ u64 devid, u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
@@ -6655,7 +6694,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL, true);
+ devid, uuid, NULL);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
@@ -6794,7 +6833,7 @@ static int read_one_dev(struct extent_buffer *leaf,
}
device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid, true);
+ fs_uuid);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
@@ -6857,6 +6896,16 @@ static int read_one_dev(struct extent_buffer *leaf,
}
fill_device_from_item(leaf, dev_item, device);
+ if (device->bdev) {
+ u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
+
+ if (device->total_bytes > max_total_bytes) {
+ btrfs_err(fs_info,
+ "device total_bytes should be at most %llu but found %llu",
+ max_total_bytes, device->total_bytes);
+ return -EINVAL;
+ }
+ }
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
@@ -6891,11 +6940,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
- sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
+ sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
+ root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
set_extent_buffer_uptodate(sb);
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all it's
@@ -7059,12 +7108,8 @@ static void readahead_tree_node_children(struct extent_buffer *node)
int i;
const int nr_items = btrfs_header_nritems(node);
- for (i = 0; i < nr_items; i++) {
- u64 start;
-
- start = btrfs_node_blockptr(node, i);
- readahead_tree_block(node->fs_info, start);
- }
+ for (i = 0; i < nr_items; i++)
+ btrfs_readahead_node_child(node, i);
}
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
@@ -7451,8 +7496,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
- true);
+ dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
@@ -7583,28 +7627,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/* Make sure no dev extent is beyond device bondary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
+ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
goto out;
}
- /* It's possible this device is a dummy for seed device */
- if (dev->disk_total_bytes == 0) {
- struct btrfs_fs_devices *devs;
-
- devs = list_first_entry(&fs_info->fs_devices->seed_list,
- struct btrfs_fs_devices, seed_list);
- dev = btrfs_find_device(devs, devid, NULL, NULL, false);
- if (!dev) {
- btrfs_err(fs_info, "failed to find seed devid %llu",
- devid);
- ret = -EUCLEAN;
- goto out;
- }
- }
-
if (physical_offset + physical_len > dev->disk_total_bytes) {
btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
@@ -7659,6 +7688,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
u64 prev_dev_ext_end = 0;
int ret = 0;
+ /*
+ * We don't have a dev_root because we mounted with ignorebadroots and
+ * failed to load the root, so we want to skip the verification in this
+ * case for sure.
+ *
+ * However if the dev root is fine, but the tree itself is corrupted
+ * we'd still fail to mount. This verification is only to make sure
+ * writes can happen safely, so instead just bypass this check
+ * completely in the case of IGNOREBADROOTS.
+ */
+ if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+ return 0;
+
key.objectid = 1;
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 232f02bd214f..1997a4649a66 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -52,6 +52,8 @@ struct btrfs_io_geometry {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
#define BTRFS_DEV_STATE_NO_READA (5)
+struct btrfs_zoned_device_info;
+
struct btrfs_device {
struct list_head dev_list; /* device_list_mutex */
struct list_head dev_alloc_list; /* chunk mutex */
@@ -65,6 +67,8 @@ struct btrfs_device {
struct block_device *bdev;
+ struct btrfs_zoned_device_info *zone_info;
+
/* the mode sent to blkdev_get */
fmode_t mode;
@@ -211,6 +215,16 @@ enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_REGULAR,
};
+/*
+ * Read policies for mirrored block group profiles, read picks the stripe based
+ * on these policies.
+ */
+enum btrfs_read_policy {
+ /* Use process PID to choose the stripe */
+ BTRFS_READ_POLICY_PID,
+ BTRFS_NR_READ_POLICY,
+};
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -264,6 +278,9 @@ struct btrfs_fs_devices {
struct completion kobj_unregister;
enum btrfs_chunk_allocation_policy chunk_alloc_policy;
+
+ /* Policy used to read the mirrored stripes */
+ enum btrfs_read_policy read_policy;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -436,7 +453,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
@@ -453,7 +470,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid, bool seed);
+ u64 devid, u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 95d9aebff2c4..af6246f36a9e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -213,9 +213,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
}
out:
btrfs_free_path(path);
- if (!ret)
+ if (!ret) {
set_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
+ }
return ret;
}
@@ -239,7 +241,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
out:
btrfs_end_transaction(trans);
@@ -390,7 +392,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (!ret) {
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
BUG_ON(ret);
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
new file mode 100644
index 000000000000..c38846659019
--- /dev/null
+++ b/fs/btrfs/zoned.c
@@ -0,0 +1,616 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "zoned.h"
+#include "rcu-string.h"
+
+/* Maximum number of zones to report per blkdev_report_zones() call */
+#define BTRFS_REPORT_NR_ZONES 4096
+
+/* Number of superblock log zones */
+#define BTRFS_NR_SB_LOG_ZONES 2
+
+static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
+{
+ struct blk_zone *zones = data;
+
+ memcpy(&zones[idx], zone, sizeof(*zone));
+
+ return 0;
+}
+
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+ u64 *wp_ret)
+{
+ bool empty[BTRFS_NR_SB_LOG_ZONES];
+ bool full[BTRFS_NR_SB_LOG_ZONES];
+ sector_t sector;
+
+ ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
+ zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
+
+ empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
+ empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
+ full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
+ full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+
+ /*
+ * Possible states of log buffer zones
+ *
+ * Empty[0] In use[0] Full[0]
+ * Empty[1] * x 0
+ * In use[1] 0 x 0
+ * Full[1] 1 1 C
+ *
+ * Log position:
+ * *: Special case, no superblock is written
+ * 0: Use write pointer of zones[0]
+ * 1: Use write pointer of zones[1]
+ * C: Compare super blcoks from zones[0] and zones[1], use the latest
+ * one determined by generation
+ * x: Invalid state
+ */
+
+ if (empty[0] && empty[1]) {
+ /* Special case to distinguish no superblock to read */
+ *wp_ret = zones[0].start << SECTOR_SHIFT;
+ return -ENOENT;
+ } else if (full[0] && full[1]) {
+ /* Compare two super blocks */
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct page *page[BTRFS_NR_SB_LOG_ZONES];
+ struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
+ int i;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ u64 bytenr;
+
+ bytenr = ((zones[i].start + zones[i].len)
+ << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
+
+ page[i] = read_cache_page_gfp(mapping,
+ bytenr >> PAGE_SHIFT, GFP_NOFS);
+ if (IS_ERR(page[i])) {
+ if (i == 1)
+ btrfs_release_disk_super(super[0]);
+ return PTR_ERR(page[i]);
+ }
+ super[i] = page_address(page[i]);
+ }
+
+ if (super[0]->generation > super[1]->generation)
+ sector = zones[1].start;
+ else
+ sector = zones[0].start;
+
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ btrfs_release_disk_super(super[i]);
+ } else if (!full[0] && (empty[1] || full[1])) {
+ sector = zones[0].wp;
+ } else if (full[0]) {
+ sector = zones[1].wp;
+ } else {
+ return -EUCLEAN;
+ }
+ *wp_ret = sector << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * The following zones are reserved as the circular buffer on ZONED btrfs.
+ * - The primary superblock: zones 0 and 1
+ * - The first copy: zones 16 and 17
+ * - The second copy: zones 1024 or zone at 256GB which is minimum, and
+ * the following one
+ */
+static inline u32 sb_zone_number(int shift, int mirror)
+{
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+
+ switch (mirror) {
+ case 0: return 0;
+ case 1: return 16;
+ case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+ }
+
+ return 0;
+}
+
+static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int *nr_zones)
+{
+ int ret;
+
+ if (!*nr_zones)
+ return 0;
+
+ ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+ copy_zone_info_cb, zones);
+ if (ret < 0) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read zone %llu on %s (devid %llu)",
+ pos, rcu_str_deref(device->name),
+ device->devid);
+ return ret;
+ }
+ *nr_zones = ret;
+ if (!ret)
+ return -EIO;
+
+ return 0;
+}
+
+int btrfs_get_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_zoned_device_info *zone_info = NULL;
+ struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ sector_t nr_sectors;
+ sector_t sector = 0;
+ struct blk_zone *zones = NULL;
+ unsigned int i, nreported = 0, nr_zones;
+ unsigned int zone_sectors;
+ int ret;
+
+ if (!bdev_is_zoned(bdev))
+ return 0;
+
+ if (device->zone_info)
+ return 0;
+
+ zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
+ if (!zone_info)
+ return -ENOMEM;
+
+ nr_sectors = bdev_nr_sectors(bdev);
+ zone_sectors = bdev_zone_sectors(bdev);
+ /* Check if it's power of 2 (see is_power_of_2) */
+ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+ zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+ zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+ zone_info->max_zone_append_size =
+ (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
+ zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
+ if (!IS_ALIGNED(nr_sectors, zone_sectors))
+ zone_info->nr_zones++;
+
+ zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->seq_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->empty_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Get zones type */
+ while (sector < nr_sectors) {
+ nr_zones = BTRFS_REPORT_NR_ZONES;
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < nr_zones; i++) {
+ if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
+ __set_bit(nreported, zone_info->seq_zones);
+ if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ __set_bit(nreported, zone_info->empty_zones);
+ nreported++;
+ }
+ sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
+ }
+
+ if (nreported != zone_info->nr_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "inconsistent number of zones on %s (%u/%u)",
+ rcu_str_deref(device->name), nreported,
+ zone_info->nr_zones);
+ ret = -EIO;
+ goto out;
+ }
+
+ /* Validate superblock log */
+ nr_zones = BTRFS_NR_SB_LOG_ZONES;
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ u32 sb_zone;
+ u64 sb_wp;
+ int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
+
+ sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+ if (sb_zone + 1 >= zone_info->nr_zones)
+ continue;
+
+ sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
+ ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+ &zone_info->sb_zones[sb_pos],
+ &nr_zones);
+ if (ret)
+ goto out;
+
+ if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: failed to read super block log zone info at devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ /*
+ * If zones[0] is conventional, always use the beggining of the
+ * zone to record superblock. No need to validate in that case.
+ */
+ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
+ BLK_ZONE_TYPE_CONVENTIONAL)
+ continue;
+
+ ret = sb_write_pointer(device->bdev,
+ &zone_info->sb_zones[sb_pos], &sb_wp);
+ if (ret != -ENOENT && ret) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: super block log zone corrupted devid %llu zone %u",
+ device->devid, sb_zone);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
+
+ kfree(zones);
+
+ device->zone_info = zone_info;
+
+ /* device->fs_info is not safe to use for printing messages */
+ btrfs_info_in_rcu(NULL,
+ "host-%s zoned block device %s, %u zones of %llu bytes",
+ bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
+ rcu_str_deref(device->name), zone_info->nr_zones,
+ zone_info->zone_size);
+
+ return 0;
+
+out:
+ kfree(zones);
+ bitmap_free(zone_info->empty_zones);
+ bitmap_free(zone_info->seq_zones);
+ kfree(zone_info);
+
+ return ret;
+}
+
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return;
+
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
+ kfree(zone_info);
+ device->zone_info = NULL;
+}
+
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ unsigned int nr_zones = 1;
+ int ret;
+
+ ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
+ if (ret != 0 || !nr_zones)
+ return ret ? ret : -EIO;
+
+ return 0;
+}
+
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 zoned_devices = 0;
+ u64 nr_devices = 0;
+ u64 zone_size = 0;
+ u64 max_zone_append_size = 0;
+ const bool incompat_zoned = btrfs_is_zoned(fs_info);
+ int ret = 0;
+
+ /* Count zoned devices */
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ enum blk_zoned_model model;
+
+ if (!device->bdev)
+ continue;
+
+ model = bdev_zoned_model(device->bdev);
+ if (model == BLK_ZONED_HM ||
+ (model == BLK_ZONED_HA && incompat_zoned)) {
+ struct btrfs_zoned_device_info *zone_info;
+
+ zone_info = device->zone_info;
+ zoned_devices++;
+ if (!zone_size) {
+ zone_size = zone_info->zone_size;
+ } else if (zone_info->zone_size != zone_size) {
+ btrfs_err(fs_info,
+ "zoned: unequal block device zone sizes: have %llu found %llu",
+ device->zone_info->zone_size,
+ zone_size);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!max_zone_append_size ||
+ (zone_info->max_zone_append_size &&
+ zone_info->max_zone_append_size < max_zone_append_size))
+ max_zone_append_size =
+ zone_info->max_zone_append_size;
+ }
+ nr_devices++;
+ }
+
+ if (!zoned_devices && !incompat_zoned)
+ goto out;
+
+ if (!zoned_devices && incompat_zoned) {
+ /* No zoned block device found on ZONED filesystem */
+ btrfs_err(fs_info,
+ "zoned: no zoned devices found on a zoned filesystem");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (zoned_devices && !incompat_zoned) {
+ btrfs_err(fs_info,
+ "zoned: mode not enabled but zoned device found");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (zoned_devices != nr_devices) {
+ btrfs_err(fs_info,
+ "zoned: cannot mix zoned and regular devices");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * stripe_size is always aligned to BTRFS_STRIPE_LEN in
+ * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * check the alignment here.
+ */
+ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
+ btrfs_err(fs_info,
+ "zoned: zone size %llu not aligned to stripe %u",
+ zone_size, BTRFS_STRIPE_LEN);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "zoned: mixed block groups not supported");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ fs_info->zone_size = zone_size;
+ fs_info->max_zone_append_size = max_zone_append_size;
+
+ btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
+out:
+ return ret;
+}
+
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ if (!btrfs_is_zoned(info))
+ return 0;
+
+ /*
+ * Space cache writing is not COWed. Disable that to avoid write errors
+ * in sequential zones.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_err(info, "zoned: space cache v1 is not supported");
+ return -EINVAL;
+ }
+
+ if (btrfs_test_opt(info, NODATACOW)) {
+ btrfs_err(info, "zoned: NODATACOW not supported");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+ int rw, u64 *bytenr_ret)
+{
+ u64 wp;
+ int ret;
+
+ if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ *bytenr_ret = zones[0].start << SECTOR_SHIFT;
+ return 0;
+ }
+
+ ret = sb_write_pointer(bdev, zones, &wp);
+ if (ret != -ENOENT && ret < 0)
+ return ret;
+
+ if (rw == WRITE) {
+ struct blk_zone *reset = NULL;
+
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ reset = &zones[0];
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ reset = &zones[1];
+
+ if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ reset->start, reset->len,
+ GFP_NOFS);
+ if (ret)
+ return ret;
+
+ reset->cond = BLK_ZONE_COND_EMPTY;
+ reset->wp = reset->start;
+ }
+ } else if (ret != -ENOENT) {
+ /* For READ, we want the precious one */
+ if (wp == zones[0].start << SECTOR_SHIFT)
+ wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ wp -= BTRFS_SUPER_INFO_SIZE;
+ }
+
+ *bytenr_ret = wp;
+ return 0;
+
+}
+
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
+ unsigned int zone_sectors;
+ u32 sb_zone;
+ int ret;
+ u64 zone_size;
+ u8 zone_sectors_shift;
+ sector_t nr_sectors;
+ u32 nr_zones;
+
+ if (!bdev_is_zoned(bdev)) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ ASSERT(rw == READ || rw == WRITE);
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ if (!is_power_of_2(zone_sectors))
+ return -EINVAL;
+ zone_size = zone_sectors << SECTOR_SHIFT;
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
+ BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
+ zones);
+ if (ret < 0)
+ return ret;
+ if (ret != BTRFS_NR_SB_LOG_ZONES)
+ return -EIO;
+
+ return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ u32 zone_num;
+
+ if (!zinfo) {
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+ }
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return -ENOENT;
+
+ return sb_log_location(device->bdev,
+ &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
+ rw, bytenr_ret);
+}
+
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+ int mirror)
+{
+ u32 zone_num;
+
+ if (!zinfo)
+ return false;
+
+ zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+ if (zone_num + 1 >= zinfo->nr_zones)
+ return false;
+
+ if (!test_bit(zone_num, zinfo->seq_zones))
+ return false;
+
+ return true;
+}
+
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ struct blk_zone *zone;
+
+ if (!is_sb_log_zone(zinfo, mirror))
+ return;
+
+ zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
+ if (zone->cond != BLK_ZONE_COND_FULL) {
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+
+ return;
+ }
+
+ zone++;
+ ASSERT(zone->cond != BLK_ZONE_COND_FULL);
+ if (zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+
+ if (zone->wp == zone->start + zone->len)
+ zone->cond = BLK_ZONE_COND_FULL;
+}
+
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ sector_t zone_sectors;
+ sector_t nr_sectors;
+ u8 zone_sectors_shift;
+ u32 sb_zone;
+ u32 nr_zones;
+
+ zone_sectors = bdev_zone_sectors(bdev);
+ zone_sectors_shift = ilog2(zone_sectors);
+ nr_sectors = bdev_nr_sectors(bdev);
+ nr_zones = nr_sectors >> zone_sectors_shift;
+
+ sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+ if (sb_zone + 1 >= nr_zones)
+ return -ENOENT;
+
+ return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sb_zone << zone_sectors_shift,
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
new file mode 100644
index 000000000000..8abe2f83272b
--- /dev/null
+++ b/fs/btrfs/zoned.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ZONED_H
+#define BTRFS_ZONED_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include "volumes.h"
+#include "disk-io.h"
+
+struct btrfs_zoned_device_info {
+ /*
+ * Number of zones, zone size and types of zones if bdev is a
+ * zoned block device.
+ */
+ u64 zone_size;
+ u8 zone_size_shift;
+ u64 max_zone_append_size;
+ u32 nr_zones;
+ unsigned long *seq_zones;
+ unsigned long *empty_zones;
+ struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+};
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone);
+int btrfs_get_dev_zone_info(struct btrfs_device *device);
+void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+ u64 *bytenr_ret);
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+ u64 *bytenr_ret);
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+{
+ return 0;
+}
+
+static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
+{
+ return 0;
+}
+
+static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }
+
+static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_err(fs_info, "zoned block devices support is not enabled");
+ return -EOPNOTSUPP;
+}
+
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+{
+ return 0;
+}
+
+static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
+ int mirror, int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
+ int rw, u64 *bytenr_ret)
+{
+ *bytenr_ret = btrfs_sb_offset(mirror);
+ return 0;
+}
+
+static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{ }
+
+static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+ return 0;
+}
+
+#endif
+
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return false;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones);
+}
+
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+
+ if (!zone_info)
+ return true;
+
+ return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
+ u64 pos, bool set)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno;
+
+ if (!zone_info)
+ return;
+
+ zno = pos >> zone_info->zone_size_shift;
+ if (set)
+ set_bit(zno, zone_info->empty_zones);
+ else
+ clear_bit(zno, zone_info->empty_zones);
+}
+
+static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, true);
+}
+
+static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
+{
+ btrfs_dev_set_empty_zone_bit(device, pos, false);
+}
+
+static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
+ struct block_device *bdev)
+{
+ u64 zone_size;
+
+ if (btrfs_is_zoned(fs_info)) {
+ zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT;
+ /* Do not allow non-zoned device */
+ return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size;
+ }
+
+ /* Do not allow Host Manged zoned device */
+ return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+}
+
+static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
+{
+ /*
+ * On a non-zoned device, any address is OK. On a zoned device,
+ * non-SEQUENTIAL WRITE REQUIRED zones are capable.
+ */
+ return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos);
+}
+
+#endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f645657488..32647d2011df 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -523,7 +523,7 @@ repeat:
void emergency_thaw_bdev(struct super_block *sb)
{
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
@@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Lock out page->mem_cgroup migration to keep PageDirty
+ * Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 2d24c765cbd7..2c557229696a 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -106,15 +106,25 @@
#endif
#ifdef compat_start_thread
-#undef start_thread
-#define start_thread compat_start_thread
+#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \
+ compat_start_thread(regs, new_ip, new_sp)
#endif
-#ifdef compat_arch_setup_additional_pages
+#ifdef COMPAT_START_THREAD
+#undef START_THREAD
+#define START_THREAD COMPAT_START_THREAD
+#endif
+
+#ifdef compat_arch_setup_additional_pages
+#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \
+ compat_arch_setup_additional_pages(bprm, interpreter)
+#endif
+
+#ifdef COMPAT_ARCH_SETUP_ADDITIONAL_PAGES
#undef ARCH_HAS_SETUP_ADDITIONAL_PAGES
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
-#undef arch_setup_additional_pages
-#define arch_setup_additional_pages compat_arch_setup_additional_pages
+#undef ARCH_SETUP_ADDITIONAL_PAGES
+#define ARCH_SETUP_ADDITIONAL_PAGES COMPAT_ARCH_SETUP_ADDITIONAL_PAGES
#endif
#ifdef compat_elf_read_implies_exec
diff --git a/fs/coredump.c b/fs/coredump.c
index c6acfc694f65..a2f6ecc8e345 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -586,7 +586,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
int ispipe;
size_t *argv = NULL;
int argc = 0;
- struct files_struct *displaced;
/* require nonrelative corefile path and be extra careful */
bool need_suid_safe = false;
bool core_dumped = false;
@@ -792,11 +791,10 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
/* get us an unshared descriptor table; almost always a no-op */
- retval = unshare_files(&displaced);
+ /* The cell spufs coredump code reads the file descriptor tables */
+ retval = unshare_files();
if (retval)
goto close_fail;
- if (displaced)
- put_files_struct(displaced);
if (!dump_interrupted()) {
/*
* umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 1fbe6c24d705..6ca7d16593ff 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -14,7 +14,7 @@
#include <linux/namei.h>
#include <linux/scatterlist.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
@@ -404,7 +404,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
fname->disk_name.len = iname->len;
return 0;
}
- ret = fscrypt_get_encryption_info(dir);
+ ret = fscrypt_get_encryption_info(dir, lookup);
if (ret)
return ret;
@@ -560,7 +560,11 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
dir = dget_parent(dentry);
- err = fscrypt_get_encryption_info(d_inode(dir));
+ /*
+ * Pass allow_unsupported=true, so that files with an unsupported
+ * encryption policy can be deleted.
+ */
+ err = fscrypt_get_encryption_info(d_inode(dir), true);
valid = !fscrypt_has_encryption_key(d_inode(dir));
dput(dir);
@@ -570,7 +574,3 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
-
-const struct dentry_operations fscrypt_d_ops = {
- .d_revalidate = fscrypt_d_revalidate,
-};
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 4f5806a3b73d..3fa965eb3336 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -25,6 +25,9 @@
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
+/* Keep this in sync with include/uapi/linux/fscrypt.h */
+#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM
+
struct fscrypt_context_v1 {
u8 version; /* FSCRYPT_CONTEXT_V1 */
u8 contents_encryption_mode;
@@ -294,7 +297,6 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
u32 orig_len, u32 max_len,
u32 *encrypted_len_ret);
-extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -436,16 +438,9 @@ struct fscrypt_master_key {
* FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
* FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
*
- * Locking: protected by key->sem (outer) and mk_secret_sem (inner).
- * The reason for two locks is that key->sem also protects modifying
- * mk_users, which ranks it above the semaphore for the keyring key
- * type, which is in turn above page faults (via keyring_read). But
- * sometimes filesystems call fscrypt_get_encryption_info() from within
- * a transaction, which ranks it below page faults. So we need a
- * separate lock which protects mk_secret but not also mk_users.
+ * Locking: protected by this master key's key->sem.
*/
struct fscrypt_master_key_secret mk_secret;
- struct rw_semaphore mk_secret_sem;
/*
* For v1 policy keys: an arbitrary key descriptor which was assigned by
@@ -464,8 +459,8 @@ struct fscrypt_master_key {
*
* This is NULL for v1 policy keys; those can only be added by root.
*
- * Locking: in addition to this keyrings own semaphore, this is
- * protected by the master key's key->sem, so we can do atomic
+ * Locking: in addition to this keyring's own semaphore, this is
+ * protected by this master key's key->sem, so we can do atomic
* search+insert. It can also be searched without taking any locks, but
* in that case the returned key may have already been removed.
*/
@@ -491,9 +486,9 @@ struct fscrypt_master_key {
* Per-mode encryption keys for the various types of encryption policies
* that use them. Allocated and derived on-demand.
*/
- struct fscrypt_prepared_key mk_direct_keys[__FSCRYPT_MODE_MAX + 1];
- struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1];
- struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1];
+ struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1];
/* Hash key for inode numbers. Initialized only when needed. */
siphash_key_t mk_ino_hash_key;
@@ -507,9 +502,9 @@ is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
/*
* The READ_ONCE() is only necessary for fscrypt_drop_inode() and
* fscrypt_key_describe(). These run in atomic context, so they can't
- * take ->mk_secret_sem and thus 'secret' can change concurrently which
- * would be a data race. But they only need to know whether the secret
- * *was* present at the time of check, so READ_ONCE() suffices.
+ * take the key semaphore and thus 'secret' can change concurrently
+ * which would be a data race. But they only need to know whether the
+ * secret *was* present at the time of check, so READ_ONCE() suffices.
*/
return READ_ONCE(secret->size) != 0;
}
@@ -575,6 +570,34 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
void fscrypt_hash_inode_number(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk);
+int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);
+
+/**
+ * fscrypt_require_key() - require an inode's encryption key
+ * @inode: the inode we need the key for
+ *
+ * If the inode is encrypted, set up its encryption key if not already done.
+ * Then require that the key be present and return -ENOKEY otherwise.
+ *
+ * No locks are needed, and the key will live as long as the struct inode --- so
+ * it won't go away from under you.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ * if a problem occurred while setting up the encryption key.
+ */
+static inline int fscrypt_require_key(struct inode *inode)
+{
+ if (IS_ENCRYPTED(inode)) {
+ int err = fscrypt_get_encryption_info(inode, false);
+
+ if (err)
+ return err;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+ return 0;
+}
+
/* keysetup_v1.c */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index 0cba7928446d..e0ec21055505 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -10,7 +10,7 @@
*/
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include "fscrypt_private.h"
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 20b0df47fe6a..a73b0376e6f3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -54,15 +54,12 @@ EXPORT_SYMBOL_GPL(fscrypt_file_open);
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
struct dentry *dentry)
{
- int err;
-
- err = fscrypt_require_key(dir);
- if (err)
- return err;
-
- /* ... in case we looked up no-key name before key was added */
- if (dentry->d_flags & DCACHE_NOKEY_NAME)
+ if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
+ /*
+ * We don't need to separately check that the directory inode's key is
+ * available, as it's implied by the dentry not being a no-key name.
+ */
if (!fscrypt_has_permitted_context(dir, inode))
return -EXDEV;
@@ -75,19 +72,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- int err;
-
- err = fscrypt_require_key(old_dir);
- if (err)
- return err;
-
- err = fscrypt_require_key(new_dir);
- if (err)
- return err;
-
- /* ... in case we looked up no-key name(s) before key was added */
- if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
+ if (fscrypt_is_nokey_name(old_dentry) ||
+ fscrypt_is_nokey_name(new_dentry))
return -ENOKEY;
+ /*
+ * We don't need to separately check that the directory inodes' keys are
+ * available, as it's implied by the dentries not being no-key names.
+ */
if (old_dir != new_dir) {
if (IS_ENCRYPTED(new_dir) &&
@@ -117,12 +108,25 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
- d_set_d_op(dentry, &fscrypt_d_ops);
}
return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
+int __fscrypt_prepare_readdir(struct inode *dir)
+{
+ return fscrypt_get_encryption_info(dir, true);
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir);
+
+int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ if (attr->ia_valid & ATTR_SIZE)
+ return fscrypt_require_key(d_inode(dentry));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);
+
/**
* fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS
* @inode: the inode on which flags are being changed
@@ -138,6 +142,7 @@ int fscrypt_prepare_setflags(struct inode *inode,
unsigned int oldflags, unsigned int flags)
{
struct fscrypt_info *ci;
+ struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -153,13 +158,14 @@ int fscrypt_prepare_setflags(struct inode *inode,
ci = inode->i_crypt_info;
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
- mk = ci->ci_master_key->payload.data[0];
- down_read(&mk->mk_secret_sem);
+ key = ci->ci_master_key;
+ mk = key->payload.data[0];
+ down_read(&key->sem);
if (is_master_key_secret_present(&mk->mk_secret))
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
- up_read(&mk->mk_secret_sem);
+ up_read(&key->sem);
return err;
}
return 0;
@@ -325,7 +331,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
* Try to set up the symlink's encryption key, but we can continue
* regardless of whether the key is available or not.
*/
- err = fscrypt_get_encryption_info(inode);
+ err = fscrypt_get_encryption_info(inode, false);
if (err)
return ERR_PTR(err);
has_key = fscrypt_has_encryption_key(inode);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 53cc552a7b8f..0b3ffbb4faf4 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -44,7 +44,7 @@ static void free_master_key(struct fscrypt_master_key *mk)
wipe_master_key_secret(&mk->mk_secret);
- for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) {
+ for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
@@ -347,7 +347,6 @@ static int add_new_master_key(struct fscrypt_master_key_secret *secret,
mk->mk_spec = *mk_spec;
move_master_key_secret(&mk->mk_secret, secret);
- init_rwsem(&mk->mk_secret_sem);
refcount_set(&mk->mk_refcount, 1); /* secret is present */
INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
@@ -427,11 +426,8 @@ static int add_existing_master_key(struct fscrypt_master_key *mk,
}
/* Re-add the secret if needed. */
- if (rekey) {
- down_write(&mk->mk_secret_sem);
+ if (rekey)
move_master_key_secret(&mk->mk_secret, secret);
- up_write(&mk->mk_secret_sem);
- }
return 0;
}
@@ -975,10 +971,8 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
/* No user claims remaining. Go ahead and wipe the secret. */
dead = false;
if (is_master_key_secret_present(&mk->mk_secret)) {
- down_write(&mk->mk_secret_sem);
wipe_master_key_secret(&mk->mk_secret);
dead = refcount_dec_and_test(&mk->mk_refcount);
- up_write(&mk->mk_secret_sem);
}
up_write(&key->sem);
if (dead) {
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index d595abb8ef90..261293fb7097 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -56,6 +56,8 @@ static struct fscrypt_mode *
select_encryption_mode(const union fscrypt_policy *policy,
const struct inode *inode)
{
+ BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1);
+
if (S_ISREG(inode->i_mode))
return &fscrypt_modes[fscrypt_policy_contents_mode(policy)];
@@ -168,7 +170,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci,
unsigned int hkdf_infolen = 0;
int err;
- if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX))
+ if (WARN_ON(mode_num > FSCRYPT_MODE_MAX))
return -EINVAL;
prep_key = &keys[mode_num];
@@ -335,11 +337,11 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* Find the master key, then set up the inode's actual encryption key.
*
* If the master key is found in the filesystem-level keyring, then the
- * corresponding 'struct key' is returned in *master_key_ret with
- * ->mk_secret_sem read-locked. This is needed to ensure that only one task
- * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race
- * to create an fscrypt_info for the same inode), and to synchronize the master
- * key being removed with a new inode starting to use it.
+ * corresponding 'struct key' is returned in *master_key_ret with its semaphore
+ * read-locked. This is needed to ensure that only one task links the
+ * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create
+ * an fscrypt_info for the same inode), and to synchronize the master key being
+ * removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
bool need_dirhash_key,
@@ -388,7 +390,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
}
mk = key->payload.data[0];
- down_read(&mk->mk_secret_sem);
+ down_read(&key->sem);
/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
if (!is_master_key_secret_present(&mk->mk_secret)) {
@@ -431,7 +433,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
return 0;
out_release_key:
- up_read(&mk->mk_secret_sem);
+ up_read(&key->sem);
key_put(key);
return err;
}
@@ -534,9 +536,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
res = 0;
out:
if (master_key) {
- struct fscrypt_master_key *mk = master_key->payload.data[0];
-
- up_read(&mk->mk_secret_sem);
+ up_read(&master_key->sem);
key_put(master_key);
}
put_crypt_info(crypt_info);
@@ -546,6 +546,11 @@ out:
/**
* fscrypt_get_encryption_info() - set up an inode's encryption key
* @inode: the inode to set up the key for. Must be encrypted.
+ * @allow_unsupported: if %true, treat an unsupported encryption policy (or
+ * unrecognized encryption context) the same way as the key
+ * being unavailable, instead of returning an error. Use
+ * %false unless the operation being performed is needed in
+ * order for files (or directories) to be deleted.
*
* Set up ->i_crypt_info, if it hasn't already been done.
*
@@ -556,7 +561,7 @@ out:
* encryption key is unavailable. (Use fscrypt_has_encryption_key() to
* distinguish these cases.) Also can return another -errno code.
*/
-int fscrypt_get_encryption_info(struct inode *inode)
+int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
{
int res;
union fscrypt_context ctx;
@@ -567,29 +572,38 @@ int fscrypt_get_encryption_info(struct inode *inode)
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
+ if (res == -ERANGE && allow_unsupported)
+ return 0;
fscrypt_warn(inode, "Error %d getting encryption context", res);
return res;
}
res = fscrypt_policy_from_context(&policy, &ctx, res);
if (res) {
+ if (allow_unsupported)
+ return 0;
fscrypt_warn(inode,
"Unrecognized or corrupt encryption context");
return res;
}
- if (!fscrypt_supported_policy(&policy, inode))
+ if (!fscrypt_supported_policy(&policy, inode)) {
+ if (allow_unsupported)
+ return 0;
return -EINVAL;
+ }
res = fscrypt_setup_encryption_info(inode, &policy,
fscrypt_context_nonce(&ctx),
IS_CASEFOLDED(inode) &&
S_ISDIR(inode->i_mode));
+
+ if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */
+ res = 0;
if (res == -ENOKEY)
res = 0;
return res;
}
-EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
* fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
@@ -710,7 +724,7 @@ int fscrypt_drop_inode(struct inode *inode)
return 0;
/*
- * Note: since we aren't holding ->mk_secret_sem, the result here can
+ * Note: since we aren't holding the key semaphore, the result here can
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
* evicting while iput() is racing with the key being removed, since
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4441d9944b9e..a51cef6bd27f 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -175,7 +175,10 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
return false;
}
- if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+ if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
+ FSCRYPT_POLICY_FLAG_DIRECT_KEY |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
policy->flags);
return false;
@@ -587,7 +590,7 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
union fscrypt_policy parent_policy, child_policy;
- int err;
+ int err, err1, err2;
/* No restrictions on file types which are never encrypted */
if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
@@ -617,19 +620,25 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
* In any case, if an unexpected error occurs, fall back to "forbidden".
*/
- err = fscrypt_get_encryption_info(parent);
+ err = fscrypt_get_encryption_info(parent, true);
if (err)
return 0;
- err = fscrypt_get_encryption_info(child);
+ err = fscrypt_get_encryption_info(child, true);
if (err)
return 0;
- err = fscrypt_get_policy(parent, &parent_policy);
- if (err)
- return 0;
+ err1 = fscrypt_get_policy(parent, &parent_policy);
+ err2 = fscrypt_get_policy(child, &child_policy);
- err = fscrypt_get_policy(child, &child_policy);
- if (err)
+ /*
+ * Allow the case where the parent and child both have an unrecognized
+ * encryption policy, so that files with an unrecognized encryption
+ * policy can be deleted.
+ */
+ if (err1 == -EINVAL && err2 == -EINVAL)
+ return 1;
+
+ if (err1 || err2)
return 0;
return fscrypt_policies_equal(&parent_policy, &child_policy);
diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..26d5dcd2d69e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -810,12 +810,11 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide range to follow_pte_pmd it will
- * call mmu_notifier_invalidate_range_start() on our behalf
- * before taking any lock.
+ * Note because we provide range to follow_pte it will call
+ * mmu_notifier_invalidate_range_start() on our behalf before
+ * taking any lock.
*/
- if (follow_pte_pmd(vma->vm_mm, address, &range,
- &ptep, &pmdp, &ptl))
+ if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
continue;
/*
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 624617c12250..561dcad08ad6 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
- ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
+ ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS);
if (!ls->ls_recover_buf)
goto out_lkbidr;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 79f56f16bc2c..372c34ff8594 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -78,9 +78,9 @@ struct connection {
#define CF_APP_LIMITED 7
#define CF_CLOSING 8
#define CF_SHUTDOWN 9
+#define CF_CONNECTED 10
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
- int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
int retries;
@@ -97,6 +97,11 @@ struct connection {
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
+struct listen_connection {
+ struct socket *sock;
+ struct work_struct rwork;
+};
+
/* An entry waiting to be sent */
struct writequeue_entry {
struct list_head list;
@@ -126,6 +131,7 @@ static struct listen_sock_callbacks {
static LIST_HEAD(dlm_node_addrs);
static DEFINE_SPINLOCK(dlm_node_addrs_spin);
+static struct listen_connection listen_con;
static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
static int dlm_allow_conn;
@@ -141,6 +147,9 @@ DEFINE_STATIC_SRCU(connections_srcu);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void sctp_connect_to_sock(struct connection *con);
+static void tcp_connect_to_sock(struct connection *con);
+static void dlm_tcp_shutdown(struct connection *con);
/* This is deliberately very simple because most clusters have simple
sequential nodeids, so we should be able to go straight to a connection
@@ -169,6 +178,31 @@ static struct connection *__find_con(int nodeid)
return NULL;
}
+static int dlm_con_init(struct connection *con, int nodeid)
+{
+ con->rx_buflen = dlm_config.ci_buffer_size;
+ con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
+ if (!con->rx_buf)
+ return -ENOMEM;
+
+ con->nodeid = nodeid;
+ mutex_init(&con->sock_mutex);
+ INIT_LIST_HEAD(&con->writequeue);
+ spin_lock_init(&con->writequeue_lock);
+ INIT_WORK(&con->swork, process_send_sockets);
+ INIT_WORK(&con->rwork, process_recv_sockets);
+ init_waitqueue_head(&con->shutdown_wait);
+
+ if (dlm_config.ci_protocol == 0) {
+ con->connect_action = tcp_connect_to_sock;
+ con->shutdown_action = dlm_tcp_shutdown;
+ } else {
+ con->connect_action = sctp_connect_to_sock;
+ }
+
+ return 0;
+}
+
/*
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
@@ -176,7 +210,7 @@ static struct connection *__find_con(int nodeid)
static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
struct connection *con, *tmp;
- int r;
+ int r, ret;
con = __find_con(nodeid);
if (con || !alloc)
@@ -186,30 +220,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
if (!con)
return NULL;
- con->rx_buflen = dlm_config.ci_buffer_size;
- con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
- if (!con->rx_buf) {
+ ret = dlm_con_init(con, nodeid);
+ if (ret) {
kfree(con);
return NULL;
}
- con->nodeid = nodeid;
- mutex_init(&con->sock_mutex);
- INIT_LIST_HEAD(&con->writequeue);
- spin_lock_init(&con->writequeue_lock);
- INIT_WORK(&con->swork, process_send_sockets);
- INIT_WORK(&con->rwork, process_recv_sockets);
- init_waitqueue_head(&con->shutdown_wait);
-
- /* Setup action pointers for child sockets */
- if (con->nodeid) {
- struct connection *zerocon = __find_con(0);
-
- con->connect_action = zerocon->connect_action;
- if (!con->rx_action)
- con->rx_action = zerocon->rx_action;
- }
-
r = nodeid_hash(nodeid);
spin_lock(&connections_lock);
@@ -258,7 +274,8 @@ static struct dlm_node_addr *find_node_addr(int nodeid)
return NULL;
}
-static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+static int addr_compare(const struct sockaddr_storage *x,
+ const struct sockaddr_storage *y)
{
switch (x->ss_family) {
case AF_INET: {
@@ -357,10 +374,25 @@ unlock:
return rv;
}
+/* caller need to held dlm_node_addrs_spin lock */
+static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
+ const struct sockaddr_storage *addr)
+{
+ int i;
+
+ for (i = 0; i < na->addr_count; i++) {
+ if (addr_compare(na->addr[i], addr))
+ return true;
+ }
+
+ return false;
+}
+
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
struct sockaddr_storage *new_addr;
struct dlm_node_addr *new_node, *na;
+ bool ret;
new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
if (!new_node)
@@ -385,6 +417,14 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
return 0;
}
+ ret = dlm_lowcomms_na_has_addr(na, addr);
+ if (ret) {
+ spin_unlock(&dlm_node_addrs_spin);
+ kfree(new_addr);
+ kfree(new_node);
+ return -EEXIST;
+ }
+
if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
spin_unlock(&dlm_node_addrs_spin);
kfree(new_addr);
@@ -410,6 +450,11 @@ static void lowcomms_data_ready(struct sock *sk)
read_unlock_bh(&sk->sk_callback_lock);
}
+static void lowcomms_listen_data_ready(struct sock *sk)
+{
+ queue_work(recv_workqueue, &listen_con.rwork);
+}
+
static void lowcomms_write_space(struct sock *sk)
{
struct connection *con;
@@ -419,6 +464,12 @@ static void lowcomms_write_space(struct sock *sk)
if (!con)
goto out;
+ if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
+ log_print("successful connected to node %d", con->nodeid);
+ queue_work(send_workqueue, &con->swork);
+ goto out;
+ }
+
clear_bit(SOCK_NOSPACE, &con->sock->flags);
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
@@ -539,6 +590,21 @@ static void restore_callbacks(struct socket *sock)
write_unlock_bh(&sk->sk_callback_lock);
}
+static void add_listen_sock(struct socket *sock, struct listen_connection *con)
+{
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ save_listen_callbacks(sock);
+ con->sock = sock;
+
+ sk->sk_user_data = con;
+ sk->sk_allocation = GFP_NOFS;
+ /* Install a data_ready callback */
+ sk->sk_data_ready = lowcomms_listen_data_ready;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
/* Make a socket active */
static void add_sock(struct socket *sock, struct connection *con)
{
@@ -576,6 +642,15 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
}
+static void dlm_close_sock(struct socket **sock)
+{
+ if (*sock) {
+ restore_callbacks(*sock);
+ sock_release(*sock);
+ *sock = NULL;
+ }
+}
+
/* Close a remote connection and tidy up */
static void close_connection(struct connection *con, bool and_other,
bool tx, bool rx)
@@ -592,11 +667,8 @@ static void close_connection(struct connection *con, bool and_other,
}
mutex_lock(&con->sock_mutex);
- if (con->sock) {
- restore_callbacks(con->sock);
- sock_release(con->sock);
- con->sock = NULL;
- }
+ dlm_close_sock(&con->sock);
+
if (con->othercon && and_other) {
/* Will only re-enter once. */
close_connection(con->othercon, false, true, true);
@@ -604,6 +676,7 @@ static void close_connection(struct connection *con, bool and_other,
con->rx_leftover = 0;
con->retries = 0;
+ clear_bit(CF_CONNECTED, &con->flags);
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
}
@@ -691,11 +764,6 @@ static int receive_from_sock(struct connection *con)
goto out_close;
}
- if (con->nodeid == 0) {
- ret = -EINVAL;
- goto out_close;
- }
-
/* realloc if we get new buffer size to read out */
buflen = dlm_config.ci_buffer_size;
if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
@@ -767,7 +835,7 @@ out_close:
}
/* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct connection *con)
+static int accept_from_sock(struct listen_connection *con)
{
int result;
struct sockaddr_storage peeraddr;
@@ -782,12 +850,8 @@ static int accept_from_sock(struct connection *con)
return -1;
}
- mutex_lock_nested(&con->sock_mutex, 0);
-
- if (!con->sock) {
- mutex_unlock(&con->sock_mutex);
+ if (!con->sock)
return -ENOTCONN;
- }
result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
if (result < 0)
@@ -809,7 +873,6 @@ static int accept_from_sock(struct connection *con)
print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE,
b, sizeof(struct sockaddr_storage));
sock_release(newsock);
- mutex_unlock(&con->sock_mutex);
return -1;
}
@@ -828,7 +891,8 @@ static int accept_from_sock(struct connection *con)
result = -ENOMEM;
goto accept_err;
}
- mutex_lock_nested(&newcon->sock_mutex, 1);
+
+ mutex_lock(&newcon->sock_mutex);
if (newcon->sock) {
struct connection *othercon = newcon->othercon;
@@ -841,38 +905,24 @@ static int accept_from_sock(struct connection *con)
goto accept_err;
}
- othercon->rx_buflen = dlm_config.ci_buffer_size;
- othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS);
- if (!othercon->rx_buf) {
- mutex_unlock(&newcon->sock_mutex);
+ result = dlm_con_init(othercon, nodeid);
+ if (result < 0) {
kfree(othercon);
- log_print("failed to allocate incoming socket receive buffer");
- result = -ENOMEM;
goto accept_err;
}
- othercon->nodeid = nodeid;
- othercon->rx_action = receive_from_sock;
- mutex_init(&othercon->sock_mutex);
- INIT_LIST_HEAD(&othercon->writequeue);
- spin_lock_init(&othercon->writequeue_lock);
- INIT_WORK(&othercon->swork, process_send_sockets);
- INIT_WORK(&othercon->rwork, process_recv_sockets);
- init_waitqueue_head(&othercon->shutdown_wait);
- set_bit(CF_IS_OTHERCON, &othercon->flags);
+ newcon->othercon = othercon;
} else {
/* close other sock con if we have something new */
close_connection(othercon, false, true, false);
}
- mutex_lock_nested(&othercon->sock_mutex, 2);
- newcon->othercon = othercon;
+ mutex_lock_nested(&othercon->sock_mutex, 1);
add_sock(newsock, othercon);
addcon = othercon;
mutex_unlock(&othercon->sock_mutex);
}
else {
- newcon->rx_action = receive_from_sock;
/* accept copies the sk after we've saved the callbacks, so we
don't want to save them a second time or comm errors will
result in calling sk_error_report recursively. */
@@ -889,12 +939,10 @@ static int accept_from_sock(struct connection *con)
*/
if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
queue_work(recv_workqueue, &addcon->rwork);
- mutex_unlock(&con->sock_mutex);
return 0;
accept_err:
- mutex_unlock(&con->sock_mutex);
if (newsock)
sock_release(newsock);
@@ -930,7 +978,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
/*
* sctp_bind_addrs - bind a SCTP socket to all our addresses
*/
-static int sctp_bind_addrs(struct connection *con, uint16_t port)
+static int sctp_bind_addrs(struct socket *sock, uint16_t port)
{
struct sockaddr_storage localaddr;
struct sockaddr *addr = (struct sockaddr *)&localaddr;
@@ -941,9 +989,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port)
make_sockaddr(&localaddr, port, &addr_len);
if (!i)
- result = kernel_bind(con->sock, addr, addr_len);
+ result = kernel_bind(sock, addr, addr_len);
else
- result = sock_bind_add(con->sock->sk, addr, addr_len);
+ result = sock_bind_add(sock->sk, addr, addr_len);
if (result < 0) {
log_print("Can't bind to %d addr number %d, %d.\n",
@@ -967,11 +1015,6 @@ static void sctp_connect_to_sock(struct connection *con)
struct socket *sock;
unsigned int mark;
- if (con->nodeid == 0) {
- log_print("attempt to connect sock 0 foiled");
- return;
- }
-
dlm_comm_mark(con->nodeid, &mark);
mutex_lock(&con->sock_mutex);
@@ -1000,12 +1043,10 @@ static void sctp_connect_to_sock(struct connection *con)
sock_set_mark(sock->sk, mark);
- con->rx_action = receive_from_sock;
- con->connect_action = sctp_connect_to_sock;
add_sock(sock, con);
/* Bind to all addresses. */
- if (sctp_bind_addrs(con, 0))
+ if (sctp_bind_addrs(con->sock, 0))
goto bind_err;
make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
@@ -1027,8 +1068,11 @@ static void sctp_connect_to_sock(struct connection *con)
if (result == -EINPROGRESS)
result = 0;
- if (result == 0)
+ if (result == 0) {
+ if (!test_and_set_bit(CF_CONNECTED, &con->flags))
+ log_print("successful connected to node %d", con->nodeid);
goto out;
+ }
bind_err:
con->sock = NULL;
@@ -1065,11 +1109,6 @@ static void tcp_connect_to_sock(struct connection *con)
unsigned int mark;
int result;
- if (con->nodeid == 0) {
- log_print("attempt to connect sock 0 foiled");
- return;
- }
-
dlm_comm_mark(con->nodeid, &mark);
mutex_lock(&con->sock_mutex);
@@ -1095,9 +1134,6 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
}
- con->rx_action = receive_from_sock;
- con->connect_action = tcp_connect_to_sock;
- con->shutdown_action = dlm_tcp_shutdown;
add_sock(sock, con);
/* Bind to our cluster-known address connecting to avoid
@@ -1153,8 +1189,11 @@ out:
return;
}
-static struct socket *tcp_create_listen_sock(struct connection *con,
- struct sockaddr_storage *saddr)
+/* On error caller must run dlm_close_sock() for the
+ * listen connection socket.
+ */
+static int tcp_create_listen_sock(struct listen_connection *con,
+ struct sockaddr_storage *saddr)
{
struct socket *sock = NULL;
int result = 0;
@@ -1180,21 +1219,13 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
sock_set_reuseaddr(sock->sk);
- write_lock_bh(&sock->sk->sk_callback_lock);
- sock->sk->sk_user_data = con;
- save_listen_callbacks(sock);
- con->rx_action = accept_from_sock;
- con->connect_action = tcp_connect_to_sock;
- write_unlock_bh(&sock->sk->sk_callback_lock);
+ add_listen_sock(sock, con);
/* Bind to our port */
make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
if (result < 0) {
log_print("Can't bind to port %d", dlm_config.ci_tcp_port);
- sock_release(sock);
- sock = NULL;
- con->sock = NULL;
goto create_out;
}
sock_set_keepalive(sock->sk);
@@ -1202,13 +1233,13 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
result = sock->ops->listen(sock, 5);
if (result < 0) {
log_print("Can't listen on port %d", dlm_config.ci_tcp_port);
- sock_release(sock);
- sock = NULL;
goto create_out;
}
+ return 0;
+
create_out:
- return sock;
+ return result;
}
/* Get local addresses */
@@ -1237,15 +1268,14 @@ static void deinit_local(void)
kfree(dlm_local_addr[i]);
}
-/* Initialise SCTP socket and bind to all interfaces */
-static int sctp_listen_for_all(void)
+/* Initialise SCTP socket and bind to all interfaces
+ * On error caller must run dlm_close_sock() for the
+ * listen connection socket.
+ */
+static int sctp_listen_for_all(struct listen_connection *con)
{
struct socket *sock = NULL;
int result = -EINVAL;
- struct connection *con = nodeid2con(0, GFP_NOFS);
-
- if (!con)
- return -ENOMEM;
log_print("Using SCTP for communications");
@@ -1260,47 +1290,29 @@ static int sctp_listen_for_all(void)
sock_set_mark(sock->sk, dlm_config.ci_mark);
sctp_sock_set_nodelay(sock->sk);
- write_lock_bh(&sock->sk->sk_callback_lock);
- /* Init con struct */
- sock->sk->sk_user_data = con;
- save_listen_callbacks(sock);
- con->sock = sock;
- con->sock->sk->sk_data_ready = lowcomms_data_ready;
- con->rx_action = accept_from_sock;
- con->connect_action = sctp_connect_to_sock;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
+ add_listen_sock(sock, con);
/* Bind to all addresses. */
- if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
- goto create_delsock;
+ result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port);
+ if (result < 0)
+ goto out;
result = sock->ops->listen(sock, 5);
if (result < 0) {
log_print("Can't set socket listening");
- goto create_delsock;
+ goto out;
}
return 0;
-create_delsock:
- sock_release(sock);
- con->sock = NULL;
out:
return result;
}
static int tcp_listen_for_all(void)
{
- struct socket *sock = NULL;
- struct connection *con = nodeid2con(0, GFP_NOFS);
- int result = -EINVAL;
-
- if (!con)
- return -ENOMEM;
-
/* We don't support multi-homed hosts */
- if (dlm_local_addr[1] != NULL) {
+ if (dlm_local_count > 1) {
log_print("TCP protocol can't handle multi-homed hosts, "
"try SCTP");
return -EINVAL;
@@ -1308,16 +1320,7 @@ static int tcp_listen_for_all(void)
log_print("Using TCP for communications");
- sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
- if (sock) {
- add_sock(sock, con);
- result = 0;
- }
- else {
- result = -EADDRINUSE;
- }
-
- return result;
+ return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]);
}
@@ -1352,6 +1355,12 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
struct writequeue_entry *e;
int offset = 0;
+ if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) {
+ BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN);
+ log_print("failed to allocate a buffer of size %d", len);
+ return NULL;
+ }
+
con = nodeid2con(nodeid, allocation);
if (!con)
return NULL;
@@ -1506,6 +1515,8 @@ int dlm_lowcomms_close(int nodeid)
set_bit(CF_CLOSE, &con->flags);
close_connection(con, true, true, true);
clean_one_writequeue(con);
+ if (con->othercon)
+ clean_one_writequeue(con->othercon);
}
spin_lock(&dlm_node_addrs_spin);
@@ -1529,10 +1540,15 @@ static void process_recv_sockets(struct work_struct *work)
clear_bit(CF_READ_PENDING, &con->flags);
do {
- err = con->rx_action(con);
+ err = receive_from_sock(con);
} while (!err);
}
+static void process_listen_recv_socket(struct work_struct *work)
+{
+ accept_from_sock(&listen_con);
+}
+
/* Send workqueue function */
static void process_send_sockets(struct work_struct *work)
{
@@ -1616,10 +1632,11 @@ static void free_conn(struct connection *con)
spin_unlock(&connections_lock);
if (con->othercon) {
clean_one_writequeue(con->othercon);
- call_rcu(&con->othercon->rcu, connection_release);
+ call_srcu(&connections_srcu, &con->othercon->rcu,
+ connection_release);
}
clean_one_writequeue(con);
- call_rcu(&con->rcu, connection_release);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
}
static void work_flush(void)
@@ -1665,6 +1682,8 @@ void dlm_lowcomms_stop(void)
if (send_workqueue)
flush_workqueue(send_workqueue);
+ dlm_close_sock(&listen_con.sock);
+
foreach_conn(shutdown_conn);
work_flush();
foreach_conn(free_conn);
@@ -1675,7 +1694,6 @@ void dlm_lowcomms_stop(void)
int dlm_lowcomms_start(void)
{
int error = -EINVAL;
- struct connection *con;
int i;
for (i = 0; i < CONN_HASH_SIZE; i++)
@@ -1688,6 +1706,8 @@ int dlm_lowcomms_start(void)
goto fail;
}
+ INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+
error = work_start();
if (error)
goto fail;
@@ -1698,7 +1718,7 @@ int dlm_lowcomms_start(void)
if (dlm_config.ci_protocol == 0)
error = tcp_listen_for_all();
else
- error = sctp_listen_for_all();
+ error = sctp_listen_for_all(&listen_con);
if (error)
goto fail_unlisten;
@@ -1706,9 +1726,7 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
- con = nodeid2con(0,0);
- if (con)
- free_conn(con);
+ dlm_close_sock(&listen_con.sock);
fail:
return error;
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 687b2894e469..0918f9376489 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -12,6 +12,8 @@
#ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__
+#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096
+
int dlm_lowcomms_start(void);
void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 7ad83deb4505..ceef3f2074ff 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -270,7 +270,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
log_slots(ls, gen, num, NULL, array, array_size);
- max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
+ max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) -
sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
if (num > max_slots) {
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4daf5dc2b51c..73ddee5159d7 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -162,7 +162,7 @@ retry:
set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
+ memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
send_rcom(ls, mh, rc);
@@ -284,7 +284,7 @@ retry:
memcpy(rc->rc_buf, last_name, last_len);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
+ memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
send_rcom(ls, mh, rc);
@@ -304,7 +304,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
nodeid = rc_in->rc_header.h_nodeid;
inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
- outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
+ outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom);
error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
if (error)
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 46f2aa4ba46c..af159539fc1b 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,11 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-EROFS_VERSION = "1.0"
-
-ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
-
obj-$(CONFIG_EROFS_FS) += erofs.o
erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
-
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 3d452443c545..aea129ddda74 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -26,30 +26,58 @@ struct z_erofs_decompress_req {
bool inplace_io, partial_decoding;
};
+/* some special page->private (unsigned long, see below) */
+#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
+#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
+
/*
- * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
- * used to mark temporary allocated pages from other
- * file/cached pages and NULL mapping pages.
+ * For all pages in a pcluster, page->private should be one of
+ * Type Last 2bits page->private
+ * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE
+ * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE
+ * cached/managed page 00 pointer to z_erofs_pcluster
+ * online page (file-backed, 01/10/11 sub-index << 2 | count
+ * some pages can be used for inplace I/O)
+ *
+ * page->mapping should be one of
+ * Type page->mapping
+ * short-lived page NULL
+ * preallocated page NULL
+ * cached/managed page non-NULL or NULL (invalidated/truncated page)
+ * online page non-NULL
+ *
+ * For all managed pages, PG_private should be set with 1 extra refcount,
+ * which is used for page reclaim / migration.
*/
-#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
-/* check if a page is marked as staging */
-static inline bool z_erofs_page_is_staging(struct page *page)
+/*
+ * short-lived pages are pages directly from buddy system with specific
+ * page->private (no need to set PagePrivate since these are non-LRU /
+ * non-movable pages and bypass reclaim / migration code).
+ */
+static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- return page->mapping == Z_EROFS_MAPPING_STAGING;
+ if (page->private != Z_EROFS_SHORTLIVED_PAGE)
+ return false;
+
+ DBG_BUGON(page->mapping);
+ return true;
}
-static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
- struct page *page)
+static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool,
+ struct page *page)
{
- if (!z_erofs_page_is_staging(page))
+ if (!z_erofs_is_shortlived_page(page))
return false;
- /* staging pages should not be used by others at the same time */
- if (page_ref_count(page) > 1)
+ /* short-lived pages should not be used by others at the same time */
+ if (page_ref_count(page) > 1) {
put_page(page);
- else
+ } else {
+ /* follow the pcluster rule above. */
+ set_page_private(page, 0);
list_add(&page->lru, pagepool);
+ }
return true;
}
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 347be146884c..ea4f693bee22 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -312,27 +312,12 @@ static void erofs_raw_access_readahead(struct readahead_control *rac)
submit_bio(bio);
}
-static int erofs_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- struct erofs_map_blocks map = {
- .m_la = iblock << 9,
- };
- int err;
-
- err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
- if (err)
- return err;
-
- if (map.m_flags & EROFS_MAP_MAPPED)
- bh->b_blocknr = erofs_blknr(map.m_pa);
-
- return err;
-}
-
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
+ struct erofs_map_blocks map = {
+ .m_la = blknr_to_addr(block),
+ };
if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
@@ -341,7 +326,10 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
return 0;
}
- return generic_block_bmap(mapping, block, erofs_get_block);
+ if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW))
+ return erofs_blknr(map.m_pa);
+
+ return 0;
}
/* for uncompressed (aligned) files and raw access for other files */
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index cbadbf55c6c2..1cb1ffd10569 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -76,7 +76,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
victim = erofs_allocpage(pagepool, GFP_KERNEL);
if (!victim)
return -ENOMEM;
- victim->mapping = Z_EROFS_MAPPING_STAGING;
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 86fd3bf62af6..6cb356c4217b 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -20,6 +20,11 @@
enum z_erofs_cache_alloctype {
DONTALLOC, /* don't allocate any cached pages */
DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
+ /*
+ * try to use cached I/O if page allocation succeeds or fallback
+ * to in-place I/O instead to avoid any direct reclaim.
+ */
+ TRYALLOC,
};
/*
@@ -154,13 +159,16 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
- enum z_erofs_cache_alloctype type)
+ enum z_erofs_cache_alloctype type,
+ struct list_head *pagepool)
{
const struct z_erofs_pcluster *pcl = clt->pcl;
const unsigned int clusterpages = BIT(pcl->clusterbits);
struct page **pages = clt->compressedpages;
pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
bool standalone = true;
+ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
return;
@@ -168,6 +176,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
struct page *page;
compressed_page_t t;
+ struct page *newpage = NULL;
/* the compressed page was loaded before */
if (READ_ONCE(*pages))
@@ -179,7 +188,15 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
t = tag_compressed_page_justfound(page);
} else if (type == DELAYEDALLOC) {
t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
+ } else if (type == TRYALLOC) {
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ goto dontalloc;
+
+ set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
} else { /* DONTALLOC */
+dontalloc:
if (standalone)
clt->compressedpages = pages;
standalone = false;
@@ -189,8 +206,12 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
continue;
- if (page)
+ if (page) {
put_page(page);
+ } else if (newpage) {
+ set_page_private(newpage, 0);
+ list_add(&newpage->lru, pagepool);
+ }
}
if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
@@ -226,11 +247,8 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
/* barrier is implied in the following 'unlock_page' */
WRITE_ONCE(pcl->compressed_pages[i], NULL);
- set_page_private(page, 0);
- ClearPagePrivate(page);
-
+ detach_page_private(page);
unlock_page(page);
- put_page(page);
}
return 0;
}
@@ -254,10 +272,8 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
}
erofs_workgroup_unfreeze(&pcl->obj, 1);
- if (ret) {
- ClearPagePrivate(page);
- put_page(page);
- }
+ if (ret)
+ detach_page_private(page);
}
return ret;
}
@@ -297,34 +313,33 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
return ret ? 0 : -EAGAIN;
}
-static enum z_erofs_collectmode
-try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t *owned_head)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
{
- /* let's claim these following types of pclusters */
-retry:
- if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
- /* type 1, nil pcluster */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) != Z_EROFS_PCLUSTER_NIL)
- goto retry;
+ struct z_erofs_pcluster *pcl = clt->pcl;
+ z_erofs_next_pcluster_t *owned_head = &clt->owned_head;
+ /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+ *owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
- /* lucky, I am the followee :) */
- return COLLECT_PRIMARY_FOLLOWED;
- } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
- /*
- * type 2, link to the end of a existing open chain,
- * be careful that its submission itself is governed
- * by the original owned chain.
- */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) != Z_EROFS_PCLUSTER_TAIL)
- goto retry;
+ /* so we can attach this pcluster to our submission chain. */
+ clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ return;
+ }
+
+ /*
+ * type 2, link to the end of an existing open chain, be careful
+ * that its submission is controlled by the original attached chain.
+ */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- return COLLECT_PRIMARY_HOOKED;
+ clt->mode = COLLECT_PRIMARY_HOOKED;
+ clt->tailpcl = NULL;
+ return;
}
- return COLLECT_PRIMARY; /* :( better luck next time */
+ /* type 3, it belongs to a chain, but it isn't the end of the chain */
+ clt->mode = COLLECT_PRIMARY;
}
static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
@@ -369,10 +384,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
/* used to check tail merging loop due to corrupted images */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
clt->tailpcl = pcl;
- clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
- /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = NULL;
+
+ z_erofs_try_to_claim_pcluster(clt);
clt->cl = cl;
return 0;
}
@@ -562,7 +575,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct page *page, struct list_head *pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -615,11 +628,12 @@ restart_now:
/* preload all compressed pages (maybe downgrade role if necessary) */
if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
- cache_strategy = DELAYEDALLOC;
+ cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy);
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi),
+ cache_strategy, pagepool);
hitted:
/*
@@ -648,12 +662,12 @@ hitted:
retry:
err = z_erofs_attach_page(clt, page, page_type);
- /* should allocate an additional staging page for pagevec */
+ /* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
alloc_page(GFP_NOFS | __GFP_NOFAIL);
- newpage->mapping = Z_EROFS_MAPPING_STAGING;
+ set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
err = z_erofs_attach_page(clt, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE);
if (!err)
@@ -710,6 +724,11 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
queue_work(z_erofs_workqueue, &io->u.work);
}
+static bool z_erofs_page_is_invalidated(struct page *page)
+{
+ return !page->mapping && !z_erofs_is_shortlived_page(page);
+}
+
static void z_erofs_decompressqueue_endio(struct bio *bio)
{
tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
@@ -722,7 +741,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
struct page *page = bvec->bv_page;
DBG_BUGON(PageUptodate(page));
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
if (err)
SetPageError(page);
@@ -795,9 +814,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
/* all pages in pagevec ought to be valid */
DBG_BUGON(!page);
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (z_erofs_put_stagingpage(pagepool, page))
+ if (z_erofs_put_shortlivedpage(pagepool, page))
continue;
if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
@@ -831,9 +850,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
/* all compressed pages ought to be valid */
DBG_BUGON(!page);
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (!z_erofs_page_is_staging(page)) {
+ if (!z_erofs_is_shortlived_page(page)) {
if (erofs_page_is_managed(sbi, page)) {
if (!PageUptodate(page))
err = -EIO;
@@ -858,7 +877,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
overlapped = true;
}
- /* PG_error needs checking for inplaced and staging pages */
+ /* PG_error needs checking for all non-managed pages */
if (PageError(page)) {
DBG_BUGON(PageUptodate(page));
err = -EIO;
@@ -897,8 +916,8 @@ out:
if (erofs_page_is_managed(sbi, page))
continue;
- /* recycle all individual staging pages */
- (void)z_erofs_put_stagingpage(pagepool, page);
+ /* recycle all individual short-lived pages */
+ (void)z_erofs_put_shortlivedpage(pagepool, page);
WRITE_ONCE(compressed_pages[i], NULL);
}
@@ -908,10 +927,10 @@ out:
if (!page)
continue;
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
- /* recycle all individual staging pages */
- if (z_erofs_put_stagingpage(pagepool, page))
+ /* recycle all individual short-lived pages */
+ if (z_erofs_put_shortlivedpage(pagepool, page))
continue;
if (err < 0)
@@ -1008,16 +1027,30 @@ repeat:
justfound = tagptr_unfold_tags(t);
page = tagptr_unfold_ptr(t);
+ /*
+ * preallocated cached pages, which is used to avoid direct reclaim
+ * otherwise, it will go inplace I/O path instead.
+ */
+ if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
+ WRITE_ONCE(pcl->compressed_pages[nr], page);
+ set_page_private(page, 0);
+ tocache = true;
+ goto out_tocache;
+ }
mapping = READ_ONCE(page->mapping);
/*
- * unmanaged (file) pages are all locked solidly,
+ * file-backed online pages in plcuster are all locked steady,
* therefore it is impossible for `mapping' to be NULL.
*/
if (mapping && mapping != mc)
/* ought to be unmanaged pages */
goto out;
+ /* directly return for shortlived page as well */
+ if (z_erofs_is_shortlived_page(page))
+ goto out;
+
lock_page(page);
/* only true if page reclaim goes wrong, should never happen */
@@ -1061,28 +1094,21 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
- /* non-LRU / non-movable temporary page is needed */
- page->mapping = Z_EROFS_MAPPING_STAGING;
- tocache = false;
- }
-
if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- if (tocache) {
- /* since it added to managed cache successfully */
- unlock_page(page);
- put_page(page);
- } else {
- list_add(&page->lru, pagepool);
- }
+ list_add(&page->lru, pagepool);
cond_resched();
goto repeat;
}
-
- if (tocache) {
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+out_tocache:
+ if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ /* turn into temporary page if fails (1 ref) */
+ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+ goto out;
}
+ attach_page_private(page, pcl);
+ /* drop a refcount added by allocpage (then we have 2 refs here) */
+ put_page(page);
+
out: /* the only exit (for tracing and debugging) */
return page;
}
@@ -1284,7 +1310,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- err = z_erofs_do_read_page(&f, page);
+ err = z_erofs_do_read_page(&f, page, &pagepool);
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
@@ -1338,7 +1364,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page);
+ err = z_erofs_do_read_page(&f, page, &pagepool);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 68c9b29fc0ca..b503b353d4ab 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -173,6 +173,7 @@ static inline void z_erofs_onlinepage_endio(struct page *page)
v = atomic_dec_return(u.o);
if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+ set_page_private(page, 0);
ClearPagePrivate(page);
if (!PageError(page))
SetPageUptodate(page);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4df61129566d..10b81e69db74 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -109,23 +109,22 @@ struct epoll_filefd {
int fd;
} __packed;
-/*
- * Structure used to track possible nested calls, for too deep recursions
- * and loop cycles.
- */
-struct nested_call_node {
- struct list_head llink;
- void *cookie;
- void *ctx;
-};
+/* Wait structure used by the poll hooks */
+struct eppoll_entry {
+ /* List header used to link this structure to the "struct epitem" */
+ struct eppoll_entry *next;
-/*
- * This structure is used as collector for nested calls, to check for
- * maximum recursion dept and loop cycles.
- */
-struct nested_calls {
- struct list_head tasks_call_list;
- spinlock_t lock;
+ /* The "base" pointer is set to the container "struct epitem" */
+ struct epitem *base;
+
+ /*
+ * Wait queue item that will be linked to the target file wait
+ * queue head.
+ */
+ wait_queue_entry_t wait;
+
+ /* The wait queue head that linked the "wait" wait queue item */
+ wait_queue_head_t *whead;
};
/*
@@ -154,17 +153,14 @@ struct epitem {
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
- /* Number of active wait queue attached to poll operations */
- int nwait;
-
/* List containing poll wait queues */
- struct list_head pwqlist;
+ struct eppoll_entry *pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
- struct list_head fllink;
+ struct hlist_node fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
@@ -219,6 +215,7 @@ struct eventpoll {
/* used to optimize loop detection check */
u64 gen;
+ struct hlist_head refs;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
@@ -231,37 +228,12 @@ struct eventpoll {
#endif
};
-/* Wait structure used by the poll hooks */
-struct eppoll_entry {
- /* List header used to link this structure to the "struct epitem" */
- struct list_head llink;
-
- /* The "base" pointer is set to the container "struct epitem" */
- struct epitem *base;
-
- /*
- * Wait queue item that will be linked to the target file wait
- * queue head.
- */
- wait_queue_entry_t wait;
-
- /* The wait queue head that linked the "wait" wait queue item */
- wait_queue_head_t *whead;
-};
-
/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
};
-/* Used by the ep_send_events() function as callback private data */
-struct ep_send_events_data {
- int maxevents;
- struct epoll_event __user *events;
- int res;
-};
-
/*
* Configuration options available inside /proc/sys/fs/epoll/
*/
@@ -276,7 +248,7 @@ static DEFINE_MUTEX(epmutex);
static u64 loop_check_gen = 0;
/* Used to check for epoll file descriptor inclusion loops */
-static struct nested_calls poll_loop_ncalls;
+static struct eventpoll *inserting_into;
/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;
@@ -288,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly;
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
*/
-static LIST_HEAD(tfile_check_list);
+struct epitems_head {
+ struct hlist_head epitems;
+ struct epitems_head *next;
+};
+static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
+
+static struct kmem_cache *ephead_cache __read_mostly;
+
+static inline void free_ephead(struct epitems_head *head)
+{
+ if (head)
+ kmem_cache_free(ephead_cache, head);
+}
+
+static void list_file(struct file *file)
+{
+ struct epitems_head *head;
+
+ head = container_of(file->f_ep, struct epitems_head, epitems);
+ if (!head->next) {
+ head->next = tfile_check_list;
+ tfile_check_list = head;
+ }
+}
+
+static void unlist_file(struct epitems_head *head)
+{
+ struct epitems_head *to_free = head;
+ struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
+ if (p) {
+ struct epitem *epi= container_of(p, struct epitem, fllink);
+ spin_lock(&epi->ffd.file->f_lock);
+ if (!hlist_empty(&head->epitems))
+ to_free = NULL;
+ head->next = NULL;
+ spin_unlock(&epi->ffd.file->f_lock);
+ }
+ free_ephead(to_free);
+}
#ifdef CONFIG_SYSCTL
@@ -351,19 +361,6 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
return container_of(p, struct eppoll_entry, wait)->base;
}
-/* Get the "struct epitem" from an epoll queue wrapper */
-static inline struct epitem *ep_item_from_epqueue(poll_table *p)
-{
- return container_of(p, struct ep_pqueue, pt)->epi;
-}
-
-/* Initialize the poll safe wake up structure */
-static void ep_nested_calls_init(struct nested_calls *ncalls)
-{
- INIT_LIST_HEAD(&ncalls->tasks_call_list);
- spin_lock_init(&ncalls->lock);
-}
-
/**
* ep_events_available - Checks if ready events might be available.
*
@@ -397,7 +394,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
+ BUSY_POLL_BUDGET);
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
@@ -415,12 +413,11 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
unsigned int napi_id;
struct socket *sock;
struct sock *sk;
- int err;
if (!net_busy_loop_on())
return;
- sock = sock_from_file(epi->ffd.file, &err);
+ sock = sock_from_file(epi->ffd.file);
if (!sock)
return;
@@ -458,69 +455,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
#endif /* CONFIG_NET_RX_BUSY_POLL */
-/**
- * ep_call_nested - Perform a bound (possibly) nested call, by checking
- * that the recursion limit is not exceeded, and that
- * the same nested call (by the meaning of same cookie) is
- * no re-entered.
- *
- * @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @nproc: Nested call core function pointer.
- * @priv: Opaque data to be passed to the @nproc callback.
- * @cookie: Cookie to be used to identify this nested call.
- * @ctx: This instance context.
- *
- * Returns: Returns the code returned by the @nproc callback, or -1 if
- * the maximum recursion limit has been exceeded.
- */
-static int ep_call_nested(struct nested_calls *ncalls,
- int (*nproc)(void *, void *, int), void *priv,
- void *cookie, void *ctx)
-{
- int error, call_nests = 0;
- unsigned long flags;
- struct list_head *lsthead = &ncalls->tasks_call_list;
- struct nested_call_node *tncur;
- struct nested_call_node tnode;
-
- spin_lock_irqsave(&ncalls->lock, flags);
-
- /*
- * Try to see if the current task is already inside this wakeup call.
- * We use a list here, since the population inside this set is always
- * very much limited.
- */
- list_for_each_entry(tncur, lsthead, llink) {
- if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
- /*
- * Ops ... loop detected or maximum nest level reached.
- * We abort this wake by breaking the cycle itself.
- */
- error = -1;
- goto out_unlock;
- }
- }
-
- /* Add the current task and cookie to the list */
- tnode.ctx = ctx;
- tnode.cookie = cookie;
- list_add(&tnode.llink, lsthead);
-
- spin_unlock_irqrestore(&ncalls->lock, flags);
-
- /* Call the nested function */
- error = (*nproc)(priv, cookie, call_nests);
-
- /* Remove the current task from the list */
- spin_lock_irqsave(&ncalls->lock, flags);
- list_del(&tnode.llink);
-out_unlock:
- spin_unlock_irqrestore(&ncalls->lock, flags);
-
- return error;
-}
-
/*
* As described in commit 0ccf831cb lockdep: annotate epoll
* the use of wait queues used by epoll is done in a very controlled
@@ -617,13 +551,11 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
*/
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
- struct list_head *lsthead = &epi->pwqlist;
+ struct eppoll_entry **p = &epi->pwqlist;
struct eppoll_entry *pwq;
- while (!list_empty(lsthead)) {
- pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
-
- list_del(&pwq->llink);
+ while ((pwq = *p) != NULL) {
+ *p = pwq->next;
ep_remove_wait_queue(pwq);
kmem_cache_free(pwq_cache, pwq);
}
@@ -661,38 +593,13 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
rcu_read_unlock();
}
-/**
- * ep_scan_ready_list - Scans the ready list in a way that makes possible for
- * the scan code, to call f_op->poll(). Also allows for
- * O(NumReady) performance.
- *
- * @ep: Pointer to the epoll private data structure.
- * @sproc: Pointer to the scan callback.
- * @priv: Private opaque data passed to the @sproc callback.
- * @depth: The current depth of recursive f_op->poll calls.
- * @ep_locked: caller already holds ep->mtx
- *
- * Returns: The same integer error code returned by the @sproc callback.
+
+/*
+ * ep->mutex needs to be held because we could be hit by
+ * eventpoll_release_file() and epoll_ctl().
*/
-static __poll_t ep_scan_ready_list(struct eventpoll *ep,
- __poll_t (*sproc)(struct eventpoll *,
- struct list_head *, void *),
- void *priv, int depth, bool ep_locked)
+static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
- __poll_t res;
- struct epitem *epi, *nepi;
- LIST_HEAD(txlist);
-
- lockdep_assert_irqs_enabled();
-
- /*
- * We need to lock this because we could be hit by
- * eventpoll_release_file() and epoll_ctl().
- */
-
- if (!ep_locked)
- mutex_lock_nested(&ep->mtx, depth);
-
/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
@@ -701,15 +608,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
+ lockdep_assert_irqs_enabled();
write_lock_irq(&ep->lock);
- list_splice_init(&ep->rdllist, &txlist);
+ list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
write_unlock_irq(&ep->lock);
+}
- /*
- * Now call the callback function.
- */
- res = (*sproc)(ep, &txlist, priv);
+static void ep_done_scan(struct eventpoll *ep,
+ struct list_head *txlist)
+{
+ struct epitem *epi, *nepi;
write_lock_irq(&ep->lock);
/*
@@ -744,14 +653,9 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
/*
* Quickly re-inject items left on "txlist".
*/
- list_splice(&txlist, &ep->rdllist);
+ list_splice(txlist, &ep->rdllist);
__pm_relax(ep->ws);
write_unlock_irq(&ep->lock);
-
- if (!ep_locked)
- mutex_unlock(&ep->mtx);
-
- return res;
}
static void epi_rcu_free(struct rcu_head *head)
@@ -767,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head)
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
struct file *file = epi->ffd.file;
+ struct epitems_head *to_free;
+ struct hlist_head *head;
lockdep_assert_irqs_enabled();
@@ -777,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
- list_del_rcu(&epi->fllink);
+ to_free = NULL;
+ head = file->f_ep;
+ if (head->first == &epi->fllink && !epi->fllink.next) {
+ file->f_ep = NULL;
+ if (!is_file_epoll(file)) {
+ struct epitems_head *v;
+ v = container_of(head, struct epitems_head, epitems);
+ if (!smp_load_acquire(&v->next))
+ to_free = v;
+ }
+ }
+ hlist_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
+ free_ephead(to_free);
rb_erase_cached(&epi->rbn, &ep->rbr);
@@ -864,48 +782,31 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
return 0;
}
-static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
- void *priv);
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt);
+static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
-/*
- * Differs from ep_eventpoll_poll() in that internal callers already have
- * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
- * is correctly annotated.
- */
-static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
- int depth)
-{
- struct eventpoll *ep;
- bool locked;
-
- pt->_key = epi->event.events;
- if (!is_file_epoll(epi->ffd.file))
- return vfs_poll(epi->ffd.file, pt) & epi->event.events;
-
- ep = epi->ffd.file->private_data;
- poll_wait(epi->ffd.file, &ep->poll_wait, pt);
- locked = pt && (pt->_qproc == ep_ptable_queue_proc);
-
- return ep_scan_ready_list(epi->ffd.file->private_data,
- ep_read_events_proc, &depth, depth,
- locked) & epi->event.events;
-}
-
-static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
- void *priv)
+static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
+ struct eventpoll *ep = file->private_data;
+ LIST_HEAD(txlist);
struct epitem *epi, *tmp;
poll_table pt;
- int depth = *(int *)priv;
+ __poll_t res = 0;
init_poll_funcptr(&pt, NULL);
- depth++;
- list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (ep_item_poll(epi, &pt, depth)) {
- return EPOLLIN | EPOLLRDNORM;
+ /* Insert inside our poll wait queue */
+ poll_wait(file, &ep->poll_wait, wait);
+
+ /*
+ * Proceed to find out if wanted events are really available inside
+ * the ready list.
+ */
+ mutex_lock_nested(&ep->mtx, depth);
+ ep_start_scan(ep, &txlist);
+ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
+ if (ep_item_poll(epi, &pt, depth + 1)) {
+ res = EPOLLIN | EPOLLRDNORM;
+ break;
} else {
/*
* Item has been dropped into the ready list by the poll
@@ -916,24 +817,33 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head
list_del_init(&epi->rdllink);
}
}
-
- return 0;
+ ep_done_scan(ep, &txlist);
+ mutex_unlock(&ep->mtx);
+ return res;
}
-static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
+/*
+ * Differs from ep_eventpoll_poll() in that internal callers already have
+ * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
+ * is correctly annotated.
+ */
+static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
+ int depth)
{
- struct eventpoll *ep = file->private_data;
- int depth = 0;
+ struct file *file = epi->ffd.file;
+ __poll_t res;
- /* Insert inside our poll wait queue */
- poll_wait(file, &ep->poll_wait, wait);
+ pt->_key = epi->event.events;
+ if (!is_file_epoll(file))
+ res = vfs_poll(file, pt);
+ else
+ res = __ep_eventpoll_poll(file, pt, depth);
+ return res & epi->event.events;
+}
- /*
- * Proceed to find out if wanted events are really available inside
- * the ready list.
- */
- return ep_scan_ready_list(ep, ep_read_events_proc,
- &depth, depth, false);
+static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
+{
+ return __ep_eventpoll_poll(file, wait, 0);
}
#ifdef CONFIG_PROC_FS
@@ -978,7 +888,8 @@ static const struct file_operations eventpoll_fops = {
void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
- struct epitem *epi, *next;
+ struct epitem *epi;
+ struct hlist_node *next;
/*
* We don't want to get "file->f_lock" because it is not
@@ -994,7 +905,11 @@ void eventpoll_release_file(struct file *file)
* Besides, ep_remove() acquires the lock, so we can't hold it here.
*/
mutex_lock(&epmutex);
- list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
+ if (unlikely(!file->f_ep)) {
+ mutex_unlock(&epmutex);
+ return;
+ }
+ hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) {
ep = epi->ep;
mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
@@ -1309,23 +1224,28 @@ out_unlock:
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
- struct epitem *epi = ep_item_from_epqueue(pt);
+ struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
+ struct epitem *epi = epq->epi;
struct eppoll_entry *pwq;
- if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
- init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
- pwq->whead = whead;
- pwq->base = epi;
- if (epi->event.events & EPOLLEXCLUSIVE)
- add_wait_queue_exclusive(whead, &pwq->wait);
- else
- add_wait_queue(whead, &pwq->wait);
- list_add_tail(&pwq->llink, &epi->pwqlist);
- epi->nwait++;
- } else {
- /* We have to signal that an error occurred */
- epi->nwait = -1;
+ if (unlikely(!epi)) // an earlier allocation has failed
+ return;
+
+ pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
+ if (unlikely(!pwq)) {
+ epq->epi = NULL;
+ return;
}
+
+ init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
+ pwq->whead = whead;
+ pwq->base = epi;
+ if (epi->event.events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
+ pwq->next = epi->pwqlist;
+ epi->pwqlist = pwq;
}
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
@@ -1385,42 +1305,29 @@ static void path_count_init(void)
path_count[i] = 0;
}
-static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
+static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
int error = 0;
- struct file *file = priv;
- struct file *child_file;
struct epitem *epi;
+ if (depth > EP_MAX_NESTS) /* too deep nesting */
+ return -1;
+
/* CTL_DEL can remove links here, but that can't increase our count */
- rcu_read_lock();
- list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
- child_file = epi->ep->file;
- if (is_file_epoll(child_file)) {
- if (list_empty(&child_file->f_ep_links)) {
- if (path_count_inc(call_nests)) {
- error = -1;
- break;
- }
- } else {
- error = ep_call_nested(&poll_loop_ncalls,
- reverse_path_check_proc,
- child_file, child_file,
- current);
- }
- if (error != 0)
- break;
- } else {
- printk(KERN_ERR "reverse_path_check_proc: "
- "file is not an ep!\n");
- }
+ hlist_for_each_entry_rcu(epi, refs, fllink) {
+ struct hlist_head *refs = &epi->ep->refs;
+ if (hlist_empty(refs))
+ error = path_count_inc(depth);
+ else
+ error = reverse_path_check_proc(refs, depth + 1);
+ if (error != 0)
+ break;
}
- rcu_read_unlock();
return error;
}
/**
- * reverse_path_check - The tfile_check_list is list of file *, which have
+ * reverse_path_check - The tfile_check_list is list of epitem_head, which have
* links that are proposed to be newly added. We need to
* make sure that those added links don't add too many
* paths such that we will spend all our time waking up
@@ -1431,19 +1338,18 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
*/
static int reverse_path_check(void)
{
- int error = 0;
- struct file *current_file;
+ struct epitems_head *p;
- /* let's call this for all tfiles */
- list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
+ for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
+ int error;
path_count_init();
- error = ep_call_nested(&poll_loop_ncalls,
- reverse_path_check_proc, current_file,
- current_file, current);
+ rcu_read_lock();
+ error = reverse_path_check_proc(&p->epitems, 0);
+ rcu_read_unlock();
if (error)
- break;
+ return error;
}
- return error;
+ return 0;
}
static int ep_create_wakeup_source(struct epitem *epi)
@@ -1484,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
wakeup_source_unregister(ws);
}
+static int attach_epitem(struct file *file, struct epitem *epi)
+{
+ struct epitems_head *to_free = NULL;
+ struct hlist_head *head = NULL;
+ struct eventpoll *ep = NULL;
+
+ if (is_file_epoll(file))
+ ep = file->private_data;
+
+ if (ep) {
+ head = &ep->refs;
+ } else if (!READ_ONCE(file->f_ep)) {
+allocate:
+ to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
+ if (!to_free)
+ return -ENOMEM;
+ head = &to_free->epitems;
+ }
+ spin_lock(&file->f_lock);
+ if (!file->f_ep) {
+ if (unlikely(!head)) {
+ spin_unlock(&file->f_lock);
+ goto allocate;
+ }
+ file->f_ep = head;
+ to_free = NULL;
+ }
+ hlist_add_head_rcu(&epi->fllink, file->f_ep);
+ spin_unlock(&file->f_lock);
+ free_ephead(to_free);
+ return 0;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -1495,47 +1434,62 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
+ struct eventpoll *tep = NULL;
+
+ if (is_file_epoll(tfile))
+ tep = tfile->private_data;
lockdep_assert_irqs_enabled();
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
- if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
+ if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
- INIT_LIST_HEAD(&epi->fllink);
- INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
- epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
- if (epi->event.events & EPOLLWAKEUP) {
- error = ep_create_wakeup_source(epi);
- if (error)
- goto error_create_wakeup_source;
- } else {
- RCU_INIT_POINTER(epi->ws, NULL);
- }
+ if (tep)
+ mutex_lock_nested(&tep->mtx, 1);
/* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
+ if (unlikely(attach_epitem(tfile, epi) < 0)) {
+ kmem_cache_free(epi_cache, epi);
+ if (tep)
+ mutex_unlock(&tep->mtx);
+ return -ENOMEM;
+ }
+
+ if (full_check && !tep)
+ list_file(tfile);
+
+ atomic_long_inc(&ep->user->epoll_watches);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
ep_rbtree_insert(ep, epi);
+ if (tep)
+ mutex_unlock(&tep->mtx);
/* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
+ if (unlikely(full_check && reverse_path_check())) {
+ ep_remove(ep, epi);
+ return -EINVAL;
+ }
+
+ if (epi->event.events & EPOLLWAKEUP) {
+ error = ep_create_wakeup_source(epi);
+ if (error) {
+ ep_remove(ep, epi);
+ return error;
+ }
+ }
/* Initialize the poll table using the queue callback */
epq.epi = epi;
@@ -1555,9 +1509,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
- error = -ENOMEM;
- if (epi->nwait < 0)
- goto error_unregister;
+ if (unlikely(!epq.epi)) {
+ ep_remove(ep, epi);
+ return -ENOMEM;
+ }
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1579,40 +1534,11 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
write_unlock_irq(&ep->lock);
- atomic_long_inc(&ep->user->epoll_watches);
-
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(ep, NULL);
return 0;
-
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-error_remove_epi:
- spin_lock(&tfile->f_lock);
- list_del_rcu(&epi->fllink);
- spin_unlock(&tfile->f_lock);
-
- rb_erase_cached(&epi->rbn, &ep->rbr);
-
- /*
- * We need to do this because an event could have been arrived on some
- * allocated wait queue. Note that we don't care about the ep->ovflist
- * list, since that is used/cleaned only inside a section bound by "mtx".
- * And ep_insert() is called with "mtx" held.
- */
- write_lock_irq(&ep->lock);
- if (ep_is_linked(epi))
- list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
-
- wakeup_source_unregister(ep_wakeup_source(epi));
-
-error_create_wakeup_source:
- kmem_cache_free(epi_cache, epi);
-
- return error;
}
/*
@@ -1691,28 +1617,28 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
return 0;
}
-static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
- void *priv)
+static int ep_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
{
- struct ep_send_events_data *esed = priv;
- __poll_t revents;
struct epitem *epi, *tmp;
- struct epoll_event __user *uevent = esed->events;
- struct wakeup_source *ws;
+ LIST_HEAD(txlist);
poll_table pt;
+ int res = 0;
init_poll_funcptr(&pt, NULL);
- esed->res = 0;
+
+ mutex_lock(&ep->mtx);
+ ep_start_scan(ep, &txlist);
/*
* We can loop without lock because we are passed a task private list.
- * Items cannot vanish during the loop because ep_scan_ready_list() is
- * holding "mtx" during this call.
+ * Items cannot vanish during the loop we are holding ep->mtx.
*/
- lockdep_assert_held(&ep->mtx);
+ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
+ struct wakeup_source *ws;
+ __poll_t revents;
- list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (esed->res >= esed->maxevents)
+ if (res >= maxevents)
break;
/*
@@ -1735,24 +1661,23 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
/*
* If the event mask intersect the caller-requested one,
- * deliver the event to userspace. Again, ep_scan_ready_list()
- * is holding ep->mtx, so no operations coming from userspace
- * can change the item.
+ * deliver the event to userspace. Again, we are holding ep->mtx,
+ * so no operations coming from userspace can change the item.
*/
revents = ep_item_poll(epi, &pt, 1);
if (!revents)
continue;
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
+ if (__put_user(revents, &events->events) ||
+ __put_user(epi->event.data, &events->data)) {
+ list_add(&epi->rdllink, &txlist);
ep_pm_stay_awake(epi);
- if (!esed->res)
- esed->res = -EFAULT;
- return 0;
+ if (!res)
+ res = -EFAULT;
+ break;
}
- esed->res++;
- uevent++;
+ res++;
+ events++;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
@@ -1771,20 +1696,10 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
ep_pm_stay_awake(epi);
}
}
+ ep_done_scan(ep, &txlist);
+ mutex_unlock(&ep->mtx);
- return 0;
-}
-
-static int ep_send_events(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents)
-{
- struct ep_send_events_data esed;
-
- esed.maxevents = maxevents;
- esed.events = events;
-
- ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
- return esed.res;
+ return res;
}
static inline struct timespec64 ep_set_mstimeout(long ms)
@@ -1946,40 +1861,36 @@ send_events:
}
/**
- * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
- * API, to verify that adding an epoll file inside another
+ * ep_loop_check_proc - verify that adding an epoll file inside another
* epoll structure, does not violate the constraints, in
* terms of closed loops, or too deep chains (which can
* result in excessive stack usage).
*
* @priv: Pointer to the epoll file to be currently checked.
- * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
- * data structure pointer.
- * @call_nests: Current dept of the @ep_call_nested() call stack.
+ * @depth: Current depth of the path being checked.
*
* Returns: Returns zero if adding the epoll @file inside current epoll
* structure @ep does not violate the constraints, or -1 otherwise.
*/
-static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
int error = 0;
- struct file *file = priv;
- struct eventpoll *ep = file->private_data;
- struct eventpoll *ep_tovisit;
struct rb_node *rbp;
struct epitem *epi;
- mutex_lock_nested(&ep->mtx, call_nests + 1);
+ mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
+ struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->gen == loop_check_gen)
continue;
- error = ep_call_nested(&poll_loop_ncalls,
- ep_loop_check_proc, epi->ffd.file,
- ep_tovisit, current);
+ if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
+ error = -1;
+ else
+ error = ep_loop_check_proc(ep_tovisit, depth + 1);
if (error != 0)
break;
} else {
@@ -1991,11 +1902,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
* not already there, and calling reverse_path_check()
* during ep_insert().
*/
- if (list_empty(&epi->ffd.file->f_tfile_llink)) {
- if (get_file_rcu(epi->ffd.file))
- list_add(&epi->ffd.file->f_tfile_llink,
- &tfile_check_list);
- }
+ list_file(epi->ffd.file);
}
}
mutex_unlock(&ep->mtx);
@@ -2004,34 +1911,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
}
/**
- * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
- * another epoll file (represented by @ep) does not create
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
+ * into another epoll file (represented by @from) does not create
* closed loops or too deep chains.
*
- * @ep: Pointer to the epoll private data structure.
- * @file: Pointer to the epoll file to be checked.
+ * @from: Pointer to the epoll we are inserting into.
+ * @to: Pointer to the epoll to be inserted.
*
- * Returns: Returns zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or -1 otherwise.
+ * Returns: Returns zero if adding the epoll @to inside the epoll @from
+ * does not violate the constraints, or -1 otherwise.
*/
-static int ep_loop_check(struct eventpoll *ep, struct file *file)
+static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
- return ep_call_nested(&poll_loop_ncalls,
- ep_loop_check_proc, file, ep, current);
+ inserting_into = ep;
+ return ep_loop_check_proc(to, 0);
}
static void clear_tfile_check_list(void)
{
- struct file *file;
-
- /* first clear the tfile_check_list */
- while (!list_empty(&tfile_check_list)) {
- file = list_first_entry(&tfile_check_list, struct file,
- f_tfile_llink);
- list_del_init(&file->f_tfile_llink);
- fput(file);
+ rcu_read_lock();
+ while (tfile_check_list != EP_UNACTIVE_PTR) {
+ struct epitems_head *head = tfile_check_list;
+ tfile_check_list = head->next;
+ unlist_file(head);
}
- INIT_LIST_HEAD(&tfile_check_list);
+ rcu_read_unlock();
}
/*
@@ -2181,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (!list_empty(&f.file->f_ep_links) ||
- ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
@@ -2191,25 +2094,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
+ tep = tf.file->private_data;
error = -ELOOP;
- if (ep_loop_check(ep, tf.file) != 0)
+ if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
- } else {
- get_file(tf.file);
- list_add(&tf.file->f_tfile_llink,
- &tfile_check_list);
}
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
- error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
- if (error) {
- mutex_unlock(&ep->mtx);
- goto error_tgt_fput;
- }
- }
}
}
@@ -2245,8 +2137,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error = -ENOENT;
break;
}
- if (tep != NULL)
- mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);
error_tgt_fput:
@@ -2394,12 +2284,6 @@ static int __init eventpoll_init(void)
BUG_ON(max_user_watches < 0);
/*
- * Initialize the structure used to perform epoll file descriptor
- * inclusion loops checks.
- */
- ep_nested_calls_init(&poll_loop_ncalls);
-
- /*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
*/
@@ -2413,6 +2297,9 @@ static int __init eventpoll_init(void)
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
+ ephead_cache = kmem_cache_create("ep_head",
+ sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
+
return 0;
}
fs_initcall(eventpoll_init);
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..5d4d52039105 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -64,6 +64,7 @@
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -756,8 +757,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
#ifdef CONFIG_STACK_GROWSUP
/* Limit stack size */
stack_base = bprm->rlim_stack.rlim_max;
- if (stack_base > STACK_SIZE_MAX)
- stack_base = STACK_SIZE_MAX;
+
+ stack_base = calc_max_stack_size(stack_base);
/* Add space for stack randomization. */
stack_base += (STACK_RND_MASK << PAGE_SHIFT);
@@ -965,8 +966,8 @@ EXPORT_SYMBOL(read_code);
/*
* Maps the mm_struct mm into the current task struct.
- * On success, this function returns with the mutex
- * exec_update_mutex locked.
+ * On success, this function returns with exec_update_lock
+ * held for writing.
*/
static int exec_mmap(struct mm_struct *mm)
{
@@ -981,7 +982,7 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm)
sync_mm_rss(old_mm);
- ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+ ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
return ret;
@@ -995,7 +996,7 @@ static int exec_mmap(struct mm_struct *mm)
mmap_read_lock(old_mm);
if (unlikely(old_mm->core_state)) {
mmap_read_unlock(old_mm);
- mutex_unlock(&tsk->signal->exec_update_mutex);
+ up_write(&tsk->signal->exec_update_lock);
return -EINTR;
}
}
@@ -1258,6 +1259,16 @@ int begin_new_exec(struct linux_binprm * bprm)
goto out;
/*
+ * Cancel any io_uring activity across execve
+ */
+ io_uring_task_cancel();
+
+ /* Ensure the files table is not shared. */
+ retval = unshare_files();
+ if (retval)
+ goto out;
+
+ /*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
* to be lockless.
@@ -1302,6 +1313,8 @@ int begin_new_exec(struct linux_binprm * bprm)
flush_thread();
me->personality &= ~bprm->per_clear;
+ clear_syscall_work_syscall_user_dispatch(me);
+
/*
* We have to apply CLOEXEC before we change whether the process is
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
@@ -1382,7 +1395,7 @@ int begin_new_exec(struct linux_binprm * bprm)
return 0;
out_unlock:
- mutex_unlock(&me->signal->exec_update_mutex);
+ up_write(&me->signal->exec_update_lock);
out:
return retval;
}
@@ -1423,7 +1436,7 @@ void setup_new_exec(struct linux_binprm * bprm)
* some architectures like powerpc
*/
me->mm->task_size = TASK_SIZE;
- mutex_unlock(&me->signal->exec_update_mutex);
+ up_write(&me->signal->exec_update_lock);
mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1776,21 +1789,11 @@ static int bprm_execve(struct linux_binprm *bprm,
int fd, struct filename *filename, int flags)
{
struct file *file;
- struct files_struct *displaced;
int retval;
- /*
- * Cancel any io_uring activity across execve
- */
- io_uring_task_cancel();
-
- retval = unshare_files(&displaced);
- if (retval)
- return retval;
-
retval = prepare_bprm_creds(bprm);
if (retval)
- goto out_files;
+ return retval;
check_unsafe_exec(bprm);
current->in_execve = 1;
@@ -1805,11 +1808,14 @@ static int bprm_execve(struct linux_binprm *bprm,
bprm->file = file;
/*
* Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. Relies on having exclusive access to
- * current->files (due to unshare_files above).
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
*/
- if (bprm->fdpath &&
- close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+ if (bprm->fdpath && get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* Set the unchanging part of bprm->cred */
@@ -1827,8 +1833,6 @@ static int bprm_execve(struct linux_binprm *bprm,
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
- if (displaced)
- put_files_struct(displaced);
return retval;
out:
@@ -1845,10 +1849,6 @@ out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;
-out_files:
- if (displaced)
- reset_files_struct(displaced);
-
return retval;
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 2dd55b172d57..0106eba46d5a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -417,9 +417,11 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
}
EXPORT_SYMBOL_GPL(exportfs_encode_fh);
-struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
- int fh_len, int fileid_type,
- int (*acceptable)(void *, struct dentry *), void *context)
+struct dentry *
+exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
+ int fileid_type,
+ int (*acceptable)(void *, struct dentry *),
+ void *context)
{
const struct export_operations *nop = mnt->mnt_sb->s_export_op;
struct dentry *result, *alias;
@@ -432,10 +434,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
if (!nop || !nop->fh_to_dentry)
return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
- if (PTR_ERR(result) == -ENOMEM)
- return ERR_CAST(result);
if (IS_ERR_OR_NULL(result))
- return ERR_PTR(-ESTALE);
+ return result;
/*
* If no acceptance criteria was specified by caller, a disconnected
@@ -561,10 +561,26 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
err_result:
dput(result);
- if (err != -ENOMEM)
- err = -ESTALE;
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw);
+
+struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+ int fh_len, int fileid_type,
+ int (*acceptable)(void *, struct dentry *),
+ void *context)
+{
+ struct dentry *ret;
+
+ ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
+ acceptable, context);
+ if (IS_ERR_OR_NULL(ret)) {
+ if (ret == ERR_PTR(-ENOMEM))
+ return ret;
+ return ERR_PTR(-ESTALE);
+ }
+ return ret;
+}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);
MODULE_LICENSE("GPL");
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 70355ab6740e..14aa45316ad2 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -66,12 +66,6 @@ static inline unsigned ext2_chunk_size(struct inode *inode)
return inode->i_sb->s_blocksize;
}
-static inline void ext2_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -336,6 +330,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
* returns the page in which the entry was found (as a parameter - res_page),
* and the entry itself. Page is returned mapped and unlocked.
* Entry is guaranteed to be valid.
+ *
+ * On Success ext2_put_page() should be called on *res_page.
*/
struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir,
const struct qstr *child, struct page **res_page)
@@ -401,6 +397,12 @@ found:
return de;
}
+/**
+ * Return the '..' directory entry and the page in which the entry was found
+ * (as a parameter - p).
+ *
+ * On Success ext2_put_page() should be called on *p.
+ */
struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
{
struct page *page = ext2_get_page(dir, 0, 0);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 5136b7289e8d..2a4175fbaf5e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -16,6 +16,8 @@
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/rbtree.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
/* XXX Here for now... not interested in restructing headers JUST now */
@@ -745,6 +747,11 @@ extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
extern int ext2_empty_dir (struct inode *);
extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
+static inline void ext2_put_page(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
/* ialloc.c */
extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 11c5c6fe75bb..78c417d3c898 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1256,6 +1256,7 @@ do_indirects:
mark_inode_dirty(inode);
ext2_free_branches(inode, &nr, &nr+1, 3);
}
+ break;
case EXT2_TIND_BLOCK:
;
}
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5bf2c145643b..ea980f1e2e99 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -389,23 +389,18 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
if (dir_de) {
if (old_dir != new_dir)
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
- else {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ else
+ ext2_put_page(dir_page);
inode_dec_link_count(old_dir);
}
return 0;
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ ext2_put_page(dir_page);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ ext2_put_page(old_page);
out:
return err;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 09f1fe676972..6c4753277916 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1070,7 +1070,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc_array (db_count,
+ sbi->s_group_desc = kmalloc_array(db_count,
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ca50c90adc4c..5ed870614c8d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -118,11 +118,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct buffer_head *bh = NULL;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (IS_ENCRYPTED(inode)) {
- err = fscrypt_get_encryption_info(inode);
- if (err)
- return err;
- }
+ err = fscrypt_prepare_readdir(inode);
+ if (err)
+ return err;
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
@@ -616,13 +614,6 @@ finished:
return 0;
}
-static int ext4_dir_open(struct inode * inode, struct file * filp)
-{
- if (IS_ENCRYPTED(inode))
- return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
- return 0;
-}
-
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
@@ -664,13 +655,5 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
- .open = ext4_dir_open,
.release = ext4_release_dir,
};
-
-#ifdef CONFIG_UNICODE
-const struct dentry_operations ext4_dentry_ops = {
- .d_hash = generic_ci_d_hash,
- .d_compare = generic_ci_d_compare,
-};
-#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 65ecaf96d0a4..c64ea8f59ea7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3381,10 +3381,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
/* dir.c */
extern const struct file_operations ext4_dir_operations;
-#ifdef CONFIG_UNICODE
-extern const struct dentry_operations ext4_dentry_ops;
-#endif
-
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index d62d802c9c12..7935ea6cf92c 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -80,6 +80,145 @@ struct timestamp_expectation {
bool lower_bound;
};
+static const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+};
+
+static void timestamp_expectation_to_desc(const struct timestamp_expectation *t,
+ char *desc)
+{
+ strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE);
+}
+
+KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc);
+
static time64_t get_32bit_time(const struct timestamp_expectation * const test)
{
if (test->msb_set) {
@@ -101,166 +240,35 @@ static time64_t get_32bit_time(const struct timestamp_expectation * const test)
*/
static void inode_test_xtimestamp_decoding(struct kunit *test)
{
- const struct timestamp_expectation test_data[] = {
- {
- .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0,
- .expected = {0LL, 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 0,
- .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 1,
- .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
- .msb_set = true,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 2,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 6,
- .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 0xFFFFFFFF,
- .expected = {.tv_sec = 0x300000000LL,
- .tv_nsec = MAX_NANOSECONDS},
- },
-
- {
- .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = true,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
- },
-
- {
- .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
- .msb_set = false,
- .lower_bound = false,
- .extra_bits = 3,
- .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
- }
- };
-
struct timespec64 timestamp;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
- timestamp.tv_sec = get_32bit_time(&test_data[i]);
- ext4_decode_extra_time(&timestamp,
- cpu_to_le32(test_data[i].extra_bits));
-
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_sec,
- timestamp.tv_sec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- KUNIT_EXPECT_EQ_MSG(test,
- test_data[i].expected.tv_nsec,
- timestamp.tv_nsec,
- CASE_NAME_FORMAT,
- test_data[i].test_case_name,
- test_data[i].msb_set,
- test_data[i].lower_bound,
- test_data[i].extra_bits);
- }
+
+ struct timestamp_expectation *test_param =
+ (struct timestamp_expectation *)(test->param_value);
+
+ timestamp.tv_sec = get_32bit_time(test_param);
+ ext4_decode_extra_time(&timestamp,
+ cpu_to_le32(test_param->extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_param->expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_param->test_case_name,
+ test_param->msb_set,
+ test_param->lower_bound,
+ test_param->extra_bits);
}
static struct kunit_case ext4_inode_test_cases[] = {
- KUNIT_CASE(inode_test_xtimestamp_decoding),
+ KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params),
{}
};
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f0381876a7e5..524e13432447 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -624,7 +624,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
case EXT4_GOING_FLAGS_DEFAULT:
freeze_bdev(sb->s_bdev);
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev, sb);
+ thaw_bdev(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 33509266f5a0..326fe402e495 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -643,13 +643,7 @@ static struct stats dx_show_leaf(struct inode *dir,
name = de->name;
len = de->name_len;
- if (IS_ENCRYPTED(dir))
- res = fscrypt_get_encryption_info(dir);
- if (res) {
- printk(KERN_WARNING "Error setting up"
- " fname crypto: %d\n", res);
- }
- if (!fscrypt_has_encryption_key(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
/* Directory is not encrypted */
ext4fs_dirhash(dir, de->name,
de->name_len, &h);
@@ -1010,7 +1004,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
EXT4_DIR_REC_LEN(0));
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
+ err = fscrypt_prepare_readdir(dir);
if (err < 0) {
brelse(bh);
return err;
@@ -1614,6 +1608,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
struct buffer_head *bh;
err = ext4_fname_prepare_lookup(dir, dentry, &fname);
+ generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return NULL;
if (err)
@@ -2195,6 +2190,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!dentry->d_name.len)
return -EINVAL;
+ if (fscrypt_is_nokey_name(dentry))
+ return -ENOKEY;
+
#ifdef CONFIG_UNICODE
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94472044f4c1..830c196ec069 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4044,9 +4044,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
- if (sb->s_bdev->bd_part)
- sbi->s_sectors_written_start =
- part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);
/* Cleanup superblock name */
strreplace(sb->s_id, '/', '!');
@@ -4964,11 +4963,6 @@ no_journal:
goto failed_mount4;
}
-#ifdef CONFIG_UNICODE
- if (sb->s_encoding)
- sb->s_d_op = &ext4_dentry_ops;
-#endif
-
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -5505,15 +5499,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
*/
if (!(sb->s_flags & SB_RDONLY))
ext4_update_tstamp(es, s_wtime);
- if (sb->s_bdev->bd_part)
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1));
- else
- es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
ext4_free_blocks_count_set(es,
EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 4e27fe6ed3ae..075aa3a19ff5 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -62,11 +62,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
sbi->s_sectors_written_start) >> 1);
}
@@ -74,12 +71,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
{
struct super_block *sb = sbi->s_buddy_cache->i_sb;
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]) -
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 306413589827..1e5e9b1136ee 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
struct page *dpage)
{
struct posix_acl *default_acl = NULL, *acl = NULL;
- int error = 0;
+ int error;
error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
if (error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 023462e80e58..897edb7c951a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -37,7 +37,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct address_space *mapping = META_MAPPING(sbi);
- struct page *page = NULL;
+ struct page *page;
repeat:
page = f2fs_grab_cache_page(mapping, index, false);
if (!page) {
@@ -348,13 +348,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
- if (!mutex_trylock(&sbi->cp_mutex))
+ if (!down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- mutex_unlock(&sbi->cp_mutex);
+ up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -1385,6 +1385,26 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi,
f2fs_submit_merged_write(sbi, META_FLUSH);
}
+static inline u64 get_sectors_written(struct block_device *bdev)
+{
+ return (u64)part_stat_read(bdev, sectors[STAT_WRITE]);
+}
+
+u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
+{
+ if (f2fs_is_multi_device(sbi)) {
+ u64 sectors = 0;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ sectors += get_sectors_written(FDEV(i).bdev);
+
+ return sectors;
+ }
+
+ return get_sectors_written(sbi->sb->s_bdev);
+}
+
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1395,7 +1415,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__u32 crc32 = 0;
int i;
int cp_payload_blks = __cp_payload(sbi);
- struct super_block *sb = sbi->sb;
struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
u64 kbytes_written;
int err;
@@ -1490,9 +1509,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Record write statistics in the hot node summary */
kbytes_written = sbi->kbytes_written;
- if (sb->s_bdev->bd_part)
- kbytes_written += BD_PART_WRITTEN(sbi);
-
+ kbytes_written += (f2fs_get_sectors_written(sbi) -
+ sbi->sectors_written_start) >> 1;
seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
if (__remain_node_summaries(cpc->reason)) {
@@ -1572,7 +1590,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- mutex_lock(&sbi->cp_mutex);
+ down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1600,7 +1618,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
goto out;
}
- if (NM_I(sbi)->dirty_nat_cnt == 0 &&
+ if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
SIT_I(sbi)->dirty_sentries == 0 &&
prefree_segments(sbi) == 0) {
f2fs_flush_sit_entries(sbi, cpc);
@@ -1647,7 +1665,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
- mutex_unlock(&sbi->cp_mutex);
+ up_write(&sbi->cp_global_sem);
return err;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 14262e0f1cd6..4bcbacfe3325 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -602,6 +602,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
f2fs_cops[fi->i_compress_algorithm];
unsigned int max_len, new_nr_cpages;
struct page **new_cpages;
+ u32 chksum = 0;
int i, ret;
trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
@@ -655,6 +656,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
cc->cbuf->clen = cpu_to_le32(cc->clen);
+ if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)
+ chksum = f2fs_crc32(F2FS_I_SB(cc->inode),
+ cc->cbuf->cdata, cc->clen);
+ cc->cbuf->chksum = cpu_to_le32(chksum);
+
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
cc->cbuf->reserved[i] = cpu_to_le32(0);
@@ -790,6 +796,22 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
ret = cops->decompress_pages(dic);
+ if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) {
+ u32 provided = le32_to_cpu(dic->cbuf->chksum);
+ u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen);
+
+ if (provided != calculated) {
+ if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
+ set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT);
+ printk_ratelimited(
+ "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x",
+ KERN_INFO, sbi->sb->s_id, dic->inode->i_ino,
+ provided, calculated);
+ }
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
+ }
+
out_vunmap_cbuf:
vm_unmap_ram(dic->cbuf, dic->nr_cpages);
out_vunmap_rbuf:
@@ -798,8 +820,6 @@ destroy_decompress_ctx:
if (cops->destroy_decompress_ctx)
cops->destroy_decompress_ctx(dic);
out_free_dic:
- if (verity)
- atomic_set(&dic->pending_pages, dic->nr_cpages);
if (!verity)
f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
ret, false);
@@ -921,7 +941,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
static bool cluster_may_compress(struct compress_ctx *cc)
{
- if (!f2fs_compressed_file(cc->inode))
+ if (!f2fs_need_compress_data(cc->inode))
return false;
if (f2fs_is_atomic_file(cc->inode))
return false;
diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/fs/f2fs/compress.h
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index be4da52604ed..aa34d620bec9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio)
dic = (struct decompress_io_ctx *)page_private(page);
if (dic) {
- if (atomic_dec_return(&dic->pending_pages))
+ if (atomic_dec_return(&dic->verity_pages))
continue;
f2fs_verify_pages(dic->rpages,
dic->cluster_size);
@@ -736,6 +736,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
block_t last_blkaddr, block_t cur_blkaddr)
{
+ if (unlikely(sbi->max_io_bytes &&
+ bio->bi_iter.bi_size >= sbi->max_io_bytes))
+ return false;
if (last_blkaddr + 1 != cur_blkaddr)
return false;
return __same_bdev(sbi, cur_blkaddr, bio);
@@ -1027,7 +1030,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned nr_pages, unsigned op_flag,
- pgoff_t first_idx, bool for_write)
+ pgoff_t first_idx, bool for_write,
+ bool for_verity)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
@@ -1049,7 +1053,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
post_read_steps |= 1 << STEP_DECRYPT;
if (f2fs_compressed_file(inode))
post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
- if (f2fs_need_verity(inode, first_idx))
+ if (for_verity && f2fs_need_verity(inode, first_idx))
post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) {
@@ -1079,7 +1083,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
struct bio *bio;
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
- page->index, for_write);
+ page->index, for_write, true);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1750,6 +1754,16 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
return true;
}
+static inline u64 bytes_to_blks(struct inode *inode, u64 bytes)
+{
+ return (bytes >> inode->i_blkbits);
+}
+
+static inline u64 blks_to_bytes(struct inode *inode, u64 blks)
+{
+ return (blks << inode->i_blkbits);
+}
+
static int __get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create, int flag,
pgoff_t *next_pgofs, int seg_type, bool may_write)
@@ -1758,7 +1772,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
int err;
map.m_lblk = iblock;
- map.m_len = bh->b_size >> inode->i_blkbits;
+ map.m_len = bytes_to_blks(inode, bh->b_size);
map.m_next_pgofs = next_pgofs;
map.m_next_extent = NULL;
map.m_seg_type = seg_type;
@@ -1768,20 +1782,11 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
- bh->b_size = (u64)map.m_len << inode->i_blkbits;
+ bh->b_size = blks_to_bytes(inode, map.m_len);
}
return err;
}
-static int get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, int flag,
- pgoff_t *next_pgofs)
-{
- return __get_data_block(inode, iblock, bh_result, create,
- flag, next_pgofs,
- NO_CHECK_TYPE, create);
-}
-
static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -1800,24 +1805,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
false);
}
-static int get_data_block_bmap(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_BMAP, NULL,
- NO_CHECK_TYPE, create);
-}
-
-static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
-{
- return (offset >> inode->i_blkbits);
-}
-
-static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
-{
- return (blk << inode->i_blkbits);
-}
-
static int f2fs_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
@@ -1843,7 +1830,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return err;
}
- phys = (__u64)blk_to_logical(inode, ni.blk_addr);
+ phys = blks_to_bytes(inode, ni.blk_addr);
offset = offsetof(struct f2fs_inode, i_addr) +
sizeof(__le32) * (DEF_ADDRS_PER_INODE -
get_inline_xattr_addrs(inode));
@@ -1875,7 +1862,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
return err;
}
- phys = (__u64)blk_to_logical(inode, ni.blk_addr);
+ phys = blks_to_bytes(inode, ni.blk_addr);
len = inode->i_sb->s_blocksize;
f2fs_put_page(page, 1);
@@ -1913,7 +1900,7 @@ static loff_t max_inode_blocks(struct inode *inode)
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- struct buffer_head map_bh;
+ struct f2fs_map_blocks map;
sector_t start_blk, last_blk;
pgoff_t next_pgofs;
u64 logical = 0, phys = 0, size = 0;
@@ -1945,29 +1932,31 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- if (logical_to_blk(inode, len) == 0)
- len = blk_to_logical(inode, 1);
+ if (bytes_to_blks(inode, len) == 0)
+ len = blks_to_bytes(inode, 1);
- start_blk = logical_to_blk(inode, start);
- last_blk = logical_to_blk(inode, start + len - 1);
+ start_blk = bytes_to_blks(inode, start);
+ last_blk = bytes_to_blks(inode, start + len - 1);
next:
- memset(&map_bh, 0, sizeof(struct buffer_head));
- map_bh.b_size = len;
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = start_blk;
+ map.m_len = bytes_to_blks(inode, len);
+ map.m_next_pgofs = &next_pgofs;
+ map.m_seg_type = NO_CHECK_TYPE;
if (compr_cluster)
- map_bh.b_size = blk_to_logical(inode, cluster_size - 1);
+ map.m_len = cluster_size - 1;
- ret = get_data_block(inode, start_blk, &map_bh, 0,
- F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
+ ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto out;
/* HOLE */
- if (!buffer_mapped(&map_bh)) {
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
start_blk = next_pgofs;
- if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
+ if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode,
max_inode_blocks(inode)))
goto prep_next;
@@ -1993,9 +1982,9 @@ next:
compr_cluster = false;
- logical = blk_to_logical(inode, start_blk - 1);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = blk_to_logical(inode, cluster_size);
+ logical = blks_to_bytes(inode, start_blk - 1);
+ phys = blks_to_bytes(inode, map.m_pblk);
+ size = blks_to_bytes(inode, cluster_size);
flags |= FIEMAP_EXTENT_ENCODED;
@@ -2007,20 +1996,20 @@ next:
goto prep_next;
}
- if (map_bh.b_blocknr == COMPRESS_ADDR) {
+ if (map.m_pblk == COMPRESS_ADDR) {
compr_cluster = true;
start_blk++;
goto prep_next;
}
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
+ logical = blks_to_bytes(inode, start_blk);
+ phys = blks_to_bytes(inode, map.m_pblk);
+ size = blks_to_bytes(inode, map.m_len);
flags = 0;
- if (buffer_unwritten(&map_bh))
+ if (map.m_flags & F2FS_MAP_UNWRITTEN)
flags = FIEMAP_EXTENT_UNWRITTEN;
- start_blk += logical_to_blk(inode, size);
+ start_blk += bytes_to_blks(inode, size);
prep_next:
cond_resched();
@@ -2053,8 +2042,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
bool is_readahead)
{
struct bio *bio = *bio_ret;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocksize = 1 << blkbits;
+ const unsigned blocksize = blks_to_bytes(inode, 1);
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -2063,8 +2051,8 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page,
block_in_file = (sector_t)page_index(page);
last_block = block_in_file + nr_pages;
- last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >>
- blkbits;
+ last_block_in_file = bytes_to_blks(inode,
+ f2fs_readpage_limit(inode) + blocksize - 1);
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -2133,7 +2121,7 @@ submit_and_realloc:
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
is_readahead ? REQ_RAHEAD : 0, page->index,
- false);
+ false, true);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2177,16 +2165,17 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct bio *bio = *bio_ret;
unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size;
sector_t last_block_in_file;
- const unsigned blkbits = inode->i_blkbits;
- const unsigned blocksize = 1 << blkbits;
+ const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
+ struct bio_post_read_ctx *ctx;
+ bool for_verity = false;
int i;
int ret = 0;
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
- last_block_in_file = (f2fs_readpage_limit(inode) +
- blocksize - 1) >> blkbits;
+ last_block_in_file = bytes_to_blks(inode,
+ f2fs_readpage_limit(inode) + blocksize - 1);
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
@@ -2245,10 +2234,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
goto out_put_dnode;
}
+ /*
+ * It's possible to enable fsverity on the fly when handling a cluster,
+ * which requires complicated error handling. Instead of adding more
+ * complexity, let's give a rule where end_io post-processes fsverity
+ * per cluster. In order to do that, we need to submit bio, if previous
+ * bio sets a different post-process policy.
+ */
+ if (fsverity_active(cc->inode)) {
+ atomic_set(&dic->verity_pages, cc->nr_cpages);
+ for_verity = true;
+
+ if (bio) {
+ ctx = bio->bi_private;
+ if (!(ctx->enabled_steps & (1 << STEP_VERITY))) {
+ __submit_bio(sbi, bio, DATA);
+ bio = NULL;
+ }
+ }
+ }
+
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page = dic->cpages[i];
block_t blkaddr;
- struct bio_post_read_ctx *ctx;
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
@@ -2264,17 +2272,31 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
is_readahead ? REQ_RAHEAD : 0,
- page->index, for_write);
+ page->index, for_write, for_verity);
if (IS_ERR(bio)) {
+ unsigned int remained = dic->nr_cpages - i;
+ bool release = false;
+
ret = PTR_ERR(bio);
dic->failed = true;
- if (!atomic_sub_return(dic->nr_cpages - i,
- &dic->pending_pages)) {
+
+ if (for_verity) {
+ if (!atomic_sub_return(remained,
+ &dic->verity_pages))
+ release = true;
+ } else {
+ if (!atomic_sub_return(remained,
+ &dic->pending_pages))
+ release = true;
+ }
+
+ if (release) {
f2fs_decompress_end_io(dic->rpages,
- cc->cluster_size, true,
- false);
+ cc->cluster_size, true,
+ false);
f2fs_free_dic(dic);
}
+
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
@@ -3164,7 +3186,7 @@ static inline bool __should_serialize_io(struct inode *inode,
if (IS_NOQUOTA(inode))
return false;
- if (f2fs_compressed_file(inode))
+ if (f2fs_need_compress_data(inode))
return true;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
@@ -3799,9 +3821,6 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block)
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- struct buffer_head tmp = {
- .b_size = i_blocksize(inode),
- };
sector_t blknr = 0;
if (f2fs_has_inline_data(inode))
@@ -3818,8 +3837,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
if (f2fs_compressed_file(inode)) {
blknr = f2fs_bmap_compress(inode, block);
} else {
- if (!get_data_block_bmap(inode, block, &tmp, 0))
- blknr = tmp.b_blocknr;
+ struct f2fs_map_blocks map;
+
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = block;
+ map.m_len = 1;
+ map.m_next_pgofs = NULL;
+ map.m_seg_type = NO_CHECK_TYPE;
+
+ if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP))
+ blknr = map.m_pblk;
}
out:
trace_f2fs_bmap(inode, block, blknr);
@@ -3895,7 +3922,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
sector_t highest_pblock = 0;
int nr_extents = 0;
unsigned long nr_pblocks;
- unsigned long len;
+ u64 len;
int ret;
/*
@@ -3903,29 +3930,31 @@ static int check_swap_activate_fast(struct swap_info_struct *sis,
* to be very smart.
*/
cur_lblock = 0;
- last_lblock = logical_to_blk(inode, i_size_read(inode));
+ last_lblock = bytes_to_blks(inode, i_size_read(inode));
len = i_size_read(inode);
while (cur_lblock <= last_lblock && cur_lblock < sis->max) {
- struct buffer_head map_bh;
+ struct f2fs_map_blocks map;
pgoff_t next_pgofs;
cond_resched();
- memset(&map_bh, 0, sizeof(struct buffer_head));
- map_bh.b_size = len - cur_lblock;
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = cur_lblock;
+ map.m_len = bytes_to_blks(inode, len) - cur_lblock;
+ map.m_next_pgofs = &next_pgofs;
+ map.m_seg_type = NO_CHECK_TYPE;
- ret = get_data_block(inode, cur_lblock, &map_bh, 0,
- F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
+ ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
if (ret)
goto err_out;
/* hole */
- if (!buffer_mapped(&map_bh))
+ if (!(map.m_flags & F2FS_MAP_FLAGS))
goto err_out;
- pblock = map_bh.b_blocknr;
- nr_pblocks = logical_to_blk(inode, map_bh.b_size);
+ pblock = map.m_pblk;
+ nr_pblocks = map.m_len;
if (cur_lblock + nr_pblocks >= sis->max)
nr_pblocks = sis->max - cur_lblock;
@@ -3968,7 +3997,6 @@ static int check_swap_activate(struct swap_info_struct *sis,
struct inode *inode = mapping->host;
unsigned blocks_per_page;
unsigned long page_no;
- unsigned blkbits;
sector_t probe_block;
sector_t last_block;
sector_t lowest_block = -1;
@@ -3979,8 +4007,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
if (PAGE_SIZE == F2FS_BLKSIZE)
return check_swap_activate_fast(sis, swap_file, span);
- blkbits = inode->i_blkbits;
- blocks_per_page = PAGE_SIZE >> blkbits;
+ blocks_per_page = bytes_to_blks(inode, PAGE_SIZE);
/*
* Map all the blocks into the extent list. This code doesn't try
@@ -3988,7 +4015,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
*/
probe_block = 0;
page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
+ last_block = bytes_to_blks(inode, i_size_read(inode));
while ((probe_block + blocks_per_page) <= last_block &&
page_no < sis->max) {
unsigned block_in_page;
@@ -4028,7 +4055,7 @@ static int check_swap_activate(struct swap_info_struct *sis,
}
}
- first_block >>= (PAGE_SHIFT - blkbits);
+ first_block >>= (PAGE_SHIFT - inode->i_blkbits);
if (page_no) { /* exclude the header page */
if (first_block < lowest_block)
lowest_block = first_block;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a8357fd4f5fa..197c914119da 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->node_pages = NODE_MAPPING(sbi)->nrpages;
if (sbi->meta_inode)
si->meta_pages = META_MAPPING(sbi)->nrpages;
- si->nats = NM_I(sbi)->nat_cnt;
- si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+ si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT];
+ si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT];
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
@@ -278,9 +278,10 @@ get_cache:
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
sizeof(struct free_nid);
- si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
- si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
- sizeof(struct nat_entry_set);
+ si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] *
+ sizeof(struct nat_entry);
+ si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] *
+ sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 4b9ef8bbfa4a..e6270a867be1 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -5,6 +5,7 @@
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
*/
+#include <asm/unaligned.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/sched/signal.h>
@@ -206,30 +207,55 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
+ *
+ * Returns 1 for a match, 0 for no match, and -errno on an error.
*/
-static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
+static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
const u8 *de_name, u32 de_name_len)
{
const struct super_block *sb = dir->i_sb;
const struct unicode_map *um = sb->s_encoding;
+ struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
struct qstr entry = QSTR_INIT(de_name, de_name_len);
int res;
+ if (IS_ENCRYPTED(dir)) {
+ const struct fscrypt_str encrypted_name =
+ FSTR_INIT((u8 *)de_name, de_name_len);
+
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir)))
+ return -EINVAL;
+
+ decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
+ if (!decrypted_name.name)
+ return -ENOMEM;
+ res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name,
+ &decrypted_name);
+ if (res < 0)
+ goto out;
+ entry.name = decrypted_name.name;
+ entry.len = decrypted_name.len;
+ }
+
res = utf8_strncasecmp_folded(um, name, &entry);
- if (res < 0) {
- /*
- * In strict mode, ignore invalid names. In non-strict mode,
- * fall back to treating them as opaque byte sequences.
- */
- if (sb_has_strict_encoding(sb) || name->len != entry.len)
- return false;
- return !memcmp(name->name, entry.name, name->len);
+ /*
+ * In strict mode, ignore invalid names. In non-strict mode,
+ * fall back to treating them as opaque byte sequences.
+ */
+ if (res < 0 && !sb_has_strict_encoding(sb)) {
+ res = name->len == entry.len &&
+ memcmp(name->name, entry.name, name->len) == 0;
+ } else {
+ /* utf8_strncasecmp_folded returns 0 on match */
+ res = (res == 0);
}
- return res == 0;
+out:
+ kfree(decrypted_name.name);
+ return res;
}
#endif /* CONFIG_UNICODE */
-static inline bool f2fs_match_name(const struct inode *dir,
+static inline int f2fs_match_name(const struct inode *dir,
const struct f2fs_filename *fname,
const u8 *de_name, u32 de_name_len)
{
@@ -256,6 +282,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
+ int res = 0;
if (max_slots)
*max_slots = 0;
@@ -273,10 +300,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d,
continue;
}
- if (de->hash_code == fname->hash &&
- f2fs_match_name(d->inode, fname, d->filename[bit_pos],
- le16_to_cpu(de->name_len)))
- goto found;
+ if (de->hash_code == fname->hash) {
+ res = f2fs_match_name(d->inode, fname,
+ d->filename[bit_pos],
+ le16_to_cpu(de->name_len));
+ if (res < 0)
+ return ERR_PTR(res);
+ if (res)
+ goto found;
+ }
if (max_slots && max_len > *max_slots)
*max_slots = max_len;
@@ -326,7 +358,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
}
de = find_in_block(dir, dentry_page, fname, &max_slots);
- if (de) {
+ if (IS_ERR(de)) {
+ *res_page = ERR_CAST(de);
+ de = NULL;
+ break;
+ } else if (de) {
*res_page = dentry_page;
break;
}
@@ -448,17 +484,39 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_put_page(page, 1);
}
-static void init_dent_inode(const struct f2fs_filename *fname,
+static void init_dent_inode(struct inode *dir, struct inode *inode,
+ const struct f2fs_filename *fname,
struct page *ipage)
{
struct f2fs_inode *ri;
+ if (!fname) /* tmpfile case? */
+ return;
+
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
ri->i_namelen = cpu_to_le32(fname->disk_name.len);
memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len);
+ if (IS_ENCRYPTED(dir)) {
+ file_set_enc_name(inode);
+ /*
+ * Roll-forward recovery doesn't have encryption keys available,
+ * so it can't compute the dirhash for encrypted+casefolded
+ * filenames. Append it to i_name if possible. Else, disable
+ * roll-forward recovery of the dentry (i.e., make fsync'ing the
+ * file force a checkpoint) by setting LOST_PINO.
+ */
+ if (IS_CASEFOLDED(dir)) {
+ if (fname->disk_name.len + sizeof(f2fs_hash_t) <=
+ F2FS_NAME_LEN)
+ put_unaligned(fname->hash, (f2fs_hash_t *)
+ &ri->i_name[fname->disk_name.len]);
+ else
+ file_lost_pino(inode);
+ }
+ }
set_page_dirty(ipage);
}
@@ -541,11 +599,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
return page;
}
- if (fname) {
- init_dent_inode(fname, page);
- if (IS_ENCRYPTED(dir))
- file_set_enc_name(inode);
- }
+ init_dent_inode(dir, inode, fname, page);
/*
* This file should be checkpointed during fsync.
@@ -1022,7 +1076,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
int err = 0;
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_get_encryption_info(inode);
+ err = fscrypt_prepare_readdir(inode);
if (err)
goto out;
@@ -1081,28 +1135,13 @@ out:
return err < 0 ? err : 0;
}
-static int f2fs_dir_open(struct inode *inode, struct file *filp)
-{
- if (IS_ENCRYPTED(inode))
- return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
- return 0;
-}
-
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate_shared = f2fs_readdir,
.fsync = f2fs_sync_file,
- .open = f2fs_dir_open,
.unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = f2fs_compat_ioctl,
#endif
};
-
-#ifdef CONFIG_UNICODE
-const struct dentry_operations f2fs_dentry_ops = {
- .d_hash = generic_ci_d_hash,
- .d_compare = generic_ci_d_compare,
-};
-#endif
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cb700d797296..bb11759191dc 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -33,10 +33,8 @@
#else
#define f2fs_bug_on(sbi, condition) \
do { \
- if (unlikely(condition)) { \
- WARN_ON(1); \
+ if (WARN_ON(condition)) \
set_sbi_flag(sbi, SBI_NEED_FSCK); \
- } \
} while (0)
#endif
@@ -147,8 +145,10 @@ struct f2fs_mount_info {
/* For compression */
unsigned char compress_algorithm; /* algorithm type */
- unsigned compress_log_size; /* cluster log size */
+ unsigned char compress_log_size; /* cluster log size */
+ bool compress_chksum; /* compressed data chksum */
unsigned char compress_ext_cnt; /* extension count */
+ int compress_mode; /* compression mode */
unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
};
@@ -402,85 +402,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
return size <= MAX_SIT_JENTRIES(journal);
}
-/*
- * f2fs-specific ioctl commands
- */
-#define F2FS_IOCTL_MAGIC 0xf5
-#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
-#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
-#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
-#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
-#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
-#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
-#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
- struct f2fs_defragment)
-#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
- struct f2fs_move_range)
-#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
- struct f2fs_flush_device)
-#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \
- struct f2fs_gc_range)
-#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32)
-#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32)
-#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
-#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
-#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
-#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
-#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \
- _IOR(F2FS_IOCTL_MAGIC, 18, __u64)
-#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
- _IOR(F2FS_IOCTL_MAGIC, 19, __u64)
-#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \
- struct f2fs_sectrim_range)
-
-/*
- * should be same as XFS_IOC_GOINGDOWN.
- * Flags for going down operation used by FS_IOC_GOINGDOWN
- */
-#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */
-#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
-#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
-#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
-#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
-#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
-
-/*
- * Flags used by F2FS_IOC_SEC_TRIM_FILE
- */
-#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
-#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
-#define F2FS_TRIM_FILE_MASK 0x3
-
-struct f2fs_gc_range {
- u32 sync;
- u64 start;
- u64 len;
-};
-
-struct f2fs_defragment {
- u64 start;
- u64 len;
-};
-
-struct f2fs_move_range {
- u32 dst_fd; /* destination fd */
- u64 pos_in; /* start position in src_fd */
- u64 pos_out; /* start position in dst_fd */
- u64 len; /* size to move */
-};
-
-struct f2fs_flush_device {
- u32 dev_num; /* device number to flush */
- u32 segments; /* # of segments to flush */
-};
-
-struct f2fs_sectrim_range {
- u64 start;
- u64 len;
- u64 flags;
-};
-
/* for inline stuff */
#define DEF_INLINE_RESERVED_SIZE 1
static inline int get_extra_isize(struct inode *inode);
@@ -533,9 +454,11 @@ struct f2fs_filename {
#ifdef CONFIG_UNICODE
/*
* For casefolded directories: the casefolded name, but it's left NULL
- * if the original name is not valid Unicode or if the filesystem is
- * doing an internal operation where usr_fname is also NULL. In these
- * cases we fall back to treating the name as an opaque byte sequence.
+ * if the original name is not valid Unicode, if the directory is both
+ * casefolded and encrypted and its encryption key is unavailable, or if
+ * the filesystem is doing an internal operation where usr_fname is also
+ * NULL. In all these cases we fall back to treating the name as an
+ * opaque byte sequence.
*/
struct fscrypt_str cf_name;
#endif
@@ -753,7 +676,9 @@ enum {
FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
+ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */
FI_MMAP_FILE, /* indicate file was mmapped */
+ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */
FI_MAX, /* max flag, never be used */
};
@@ -810,6 +735,7 @@ struct f2fs_inode_info {
atomic_t i_compr_blocks; /* # of compressed blocks */
unsigned char i_compress_algorithm; /* algorithm type */
unsigned char i_log_cluster_size; /* log of cluster size */
+ unsigned short i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
};
@@ -894,6 +820,13 @@ enum nid_state {
MAX_NID_STATE,
};
+enum nat_state {
+ TOTAL_NAT,
+ DIRTY_NAT,
+ RECLAIMABLE_NAT,
+ MAX_NAT_STATE,
+};
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
@@ -909,8 +842,7 @@ struct f2fs_nm_info {
struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
- unsigned int nat_cnt; /* the # of cached nat entries */
- unsigned int dirty_nat_cnt; /* total num of nat entries in set */
+ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
unsigned int nat_blocks; /* # of nat blocks */
/* free node ids management */
@@ -1320,6 +1252,18 @@ enum fsync_mode {
FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
};
+enum {
+ COMPR_MODE_FS, /*
+ * automatically compress compression
+ * enabled files
+ */
+ COMPR_MODE_USER, /*
+ * automatical compression is disabled.
+ * user can control the file compression
+ * using ioctls
+ */
+};
+
/*
* this value is set in page as a private data which indicate that
* the page is atomically written, and it is in inmem_pages list.
@@ -1349,9 +1293,15 @@ enum compress_algorithm_type {
COMPRESS_MAX,
};
-#define COMPRESS_DATA_RESERVED_SIZE 5
+enum compress_flag {
+ COMPRESS_CHKSUM,
+ COMPRESS_MAX_FLAG,
+};
+
+#define COMPRESS_DATA_RESERVED_SIZE 4
struct compress_data {
__le32 clen; /* compressed data size */
+ __le32 chksum; /* compressed data chksum */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
@@ -1404,6 +1354,7 @@ struct decompress_io_ctx {
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
atomic_t pending_pages; /* in-flight compressed page count */
+ atomic_t verity_pages; /* in-flight page count for verity */
bool failed; /* indicate IO error during decompression */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
@@ -1446,7 +1397,7 @@ struct f2fs_sb_info {
int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
- struct mutex cp_mutex; /* checkpoint procedure lock */
+ struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
struct rw_semaphore node_change; /* locking node change */
@@ -1496,6 +1447,7 @@ struct f2fs_sb_info {
loff_t max_file_blocks; /* max block index of file */
int dir_level; /* directory level */
int readdir_ra; /* readahead inode in readdir */
+ u64 max_io_bytes; /* max io bytes to merge IOs */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
@@ -1671,13 +1623,6 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi)
return sbi->s_ndevs > 1;
}
-/* For write statistics. Suppose sector size is 512 bytes,
- * and the return value is in kbytes. s is of struct f2fs_sb_info.
- */
-#define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \
- (s)->sectors_written_start) >> 1)
-
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
unsigned long now = jiffies;
@@ -2477,24 +2422,31 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
return entry;
}
-static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->gc_mode == GC_URGENT_HIGH)
- return true;
-
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
get_pages(sbi, F2FS_WB_CP_DATA) ||
get_pages(sbi, F2FS_DIO_READ) ||
get_pages(sbi, F2FS_DIO_WRITE))
- return false;
+ return true;
if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info &&
atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
- return false;
+ return true;
if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ return true;
+ return false;
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
+ if (sbi->gc_mode == GC_URGENT_HIGH)
+ return true;
+
+ if (is_inflight_io(sbi, type))
return false;
if (sbi->gc_mode == GC_URGENT_LOW &&
@@ -2829,6 +2781,22 @@ static inline int f2fs_compressed_file(struct inode *inode)
is_inode_flag_set(inode, FI_COMPRESSED_FILE);
}
+static inline bool f2fs_need_compress_data(struct inode *inode)
+{
+ int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode;
+
+ if (!f2fs_compressed_file(inode))
+ return false;
+
+ if (compress_mode == COMPR_MODE_FS)
+ return true;
+ else if (compress_mode == COMPR_MODE_USER &&
+ is_inode_flag_set(inode, FI_ENABLE_COMPRESS))
+ return true;
+
+ return false;
+}
+
static inline unsigned int addrs_per_inode(struct inode *inode)
{
unsigned int addrs = CUR_ADDRS_PER_INODE(inode) -
@@ -3251,6 +3219,8 @@ bool f2fs_empty_dir(struct inode *dir);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
+ if (fscrypt_is_nokey_name(dentry))
+ return -ENOKEY;
return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name,
inode, inode->i_ino, inode->i_mode);
}
@@ -3443,6 +3413,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
+u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
@@ -3767,9 +3738,6 @@ static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {}
#endif
extern const struct file_operations f2fs_dir_operations;
-#ifdef CONFIG_UNICODE
-extern const struct dentry_operations f2fs_dentry_ops;
-#endif
extern const struct file_operations f2fs_file_operations;
extern const struct inode_operations f2fs_file_inode_operations;
extern const struct address_space_operations f2fs_dblock_aops;
@@ -3961,6 +3929,9 @@ static inline void set_compress_context(struct inode *inode)
F2FS_OPTION(sbi).compress_algorithm;
F2FS_I(inode)->i_log_cluster_size =
F2FS_OPTION(sbi).compress_log_size;
+ F2FS_I(inode)->i_compress_flag =
+ F2FS_OPTION(sbi).compress_chksum ?
+ 1 << COMPRESS_CHKSUM : 0;
F2FS_I(inode)->i_cluster_size =
1 << F2FS_I(inode)->i_log_cluster_size;
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ee861c6d9ff0..f585545277d7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -31,6 +31,7 @@
#include "gc.h"
#include "trace.h"
#include <trace/events/f2fs.h>
+#include <uapi/linux/f2fs.h>
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
{
@@ -412,9 +413,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto fail;
/* handle inline data case */
- if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) {
- data_ofs = isize;
- goto found;
+ if (f2fs_has_inline_data(inode)) {
+ if (whence == SEEK_HOLE) {
+ data_ofs = isize;
+ goto found;
+ } else if (whence == SEEK_DATA) {
+ data_ofs = offset;
+ goto found;
+ }
}
pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
@@ -2230,16 +2236,12 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
- sb = freeze_bdev(sb->s_bdev);
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
+ ret = freeze_bdev(sb->s_bdev);
+ if (ret)
goto out;
- }
- if (sb) {
- f2fs_stop_checkpoint(sbi, false);
- set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
- thaw_bdev(sb->s_bdev, sb);
- }
+ f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ thaw_bdev(sb->s_bdev);
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
@@ -2474,26 +2476,19 @@ out:
return ret;
}
-static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
+static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
{
- struct inode *inode = file_inode(filp);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_gc_range range;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
u64 end;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
- if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg,
- sizeof(range)))
- return -EFAULT;
-
if (f2fs_readonly(sbi->sb))
return -EROFS;
- end = range.start + range.len;
- if (end < range.start || range.start < MAIN_BLKADDR(sbi) ||
+ end = range->start + range->len;
+ if (end < range->start || range->start < MAIN_BLKADDR(sbi) ||
end >= MAX_BLKADDR(sbi))
return -EINVAL;
@@ -2502,7 +2497,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
return ret;
do_more:
- if (!range.sync) {
+ if (!range->sync) {
if (!down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
@@ -2511,20 +2506,30 @@ do_more:
down_write(&sbi->gc_lock);
}
- ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
+ ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start));
if (ret) {
if (ret == -EBUSY)
ret = -EAGAIN;
goto out;
}
- range.start += BLKS_PER_SEC(sbi);
- if (range.start <= end)
+ range->start += BLKS_PER_SEC(sbi);
+ if (range->start <= end)
goto do_more;
out:
mnt_drop_write_file(filp);
return ret;
}
+static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
+{
+ struct f2fs_gc_range range;
+
+ if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+ return __f2fs_ioc_gc_range(filp, &range);
+}
+
static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -2861,9 +2866,9 @@ out:
return ret;
}
-static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
+static int __f2fs_ioc_move_range(struct file *filp,
+ struct f2fs_move_range *range)
{
- struct f2fs_move_range range;
struct fd dst;
int err;
@@ -2871,11 +2876,7 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- if (copy_from_user(&range, (struct f2fs_move_range __user *)arg,
- sizeof(range)))
- return -EFAULT;
-
- dst = fdget(range.dst_fd);
+ dst = fdget(range->dst_fd);
if (!dst.file)
return -EBADF;
@@ -2888,21 +2889,25 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
if (err)
goto err_out;
- err = f2fs_move_file_range(filp, range.pos_in, dst.file,
- range.pos_out, range.len);
+ err = f2fs_move_file_range(filp, range->pos_in, dst.file,
+ range->pos_out, range->len);
mnt_drop_write_file(filp);
- if (err)
- goto err_out;
-
- if (copy_to_user((struct f2fs_move_range __user *)arg,
- &range, sizeof(range)))
- err = -EFAULT;
err_out:
fdput(dst);
return err;
}
+static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
+{
+ struct f2fs_move_range range;
+
+ if (copy_from_user(&range, (struct f2fs_move_range __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+ return __f2fs_ioc_move_range(filp, &range);
+}
+
static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3939,13 +3944,265 @@ err:
return ret;
}
-long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg)
{
- if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
- return -EIO;
- if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp))))
- return -ENOSPC;
+ struct inode *inode = file_inode(filp);
+ struct f2fs_comp_option option;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ inode_lock_shared(inode);
+
+ if (!f2fs_compressed_file(inode)) {
+ inode_unlock_shared(inode);
+ return -ENODATA;
+ }
+
+ option.algorithm = F2FS_I(inode)->i_compress_algorithm;
+ option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size;
+
+ inode_unlock_shared(inode);
+
+ if (copy_to_user((struct f2fs_comp_option __user *)arg, &option,
+ sizeof(option)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_comp_option option;
+ int ret = 0;
+
+ if (!f2fs_sb_has_compression(sbi))
+ return -EOPNOTSUPP;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg,
+ sizeof(option)))
+ return -EFAULT;
+
+ if (!f2fs_compressed_file(inode) ||
+ option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
+ option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
+ option.algorithm >= COMPRESS_MAX)
+ return -EINVAL;
+
+ file_start_write(filp);
+ inode_lock(inode);
+
+ if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (inode->i_size != 0) {
+ ret = -EFBIG;
+ goto out;
+ }
+
+ F2FS_I(inode)->i_compress_algorithm = option.algorithm;
+ F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
+ F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size;
+ f2fs_mark_inode_dirty_sync(inode, true);
+
+ if (!f2fs_is_compress_backend_ready(inode))
+ f2fs_warn(sbi, "compression algorithm is successfully set, "
+ "but current kernel doesn't support this algorithm.");
+out:
+ inode_unlock(inode);
+ file_end_write(filp);
+
+ return ret;
+}
+
+static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
+{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, page_idx);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ pgoff_t redirty_idx = page_idx;
+ int i, page_len = 0, ret = 0;
+
+ page_cache_ra_unbounded(&ractl, len, 0);
+
+ for (i = 0; i < len; i++, page_idx++) {
+ page = read_cache_page(mapping, page_idx, NULL, NULL);
+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ break;
+ }
+ page_len++;
+ }
+
+ for (i = 0; i < page_len; i++, redirty_idx++) {
+ page = find_lock_page(mapping, redirty_idx);
+ if (!page)
+ ret = -ENOENT;
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ f2fs_put_page(page, 0);
+ }
+
+ return ret;
+}
+
+static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ pgoff_t page_idx = 0, last_idx;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int count, ret;
+
+ if (!f2fs_sb_has_compression(sbi) ||
+ F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER)
+ return -EOPNOTSUPP;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
+ file_start_write(filp);
+ inode_lock(inode);
+
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (f2fs_is_mmap_file(inode)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret)
+ goto out;
+
+ if (!atomic_read(&fi->i_compr_blocks))
+ goto out;
+
+ last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+
+ count = last_idx - page_idx;
+ while (count) {
+ int len = min(cluster_size, count);
+
+ ret = redirty_blocks(inode, page_idx, len);
+ if (ret < 0)
+ break;
+
+ if (get_dirty_pages(inode) >= blk_per_seg)
+ filemap_fdatawrite(inode->i_mapping);
+
+ count -= len;
+ page_idx += len;
+ }
+
+ if (!ret)
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0,
+ LLONG_MAX);
+
+ if (ret)
+ f2fs_warn(sbi, "%s: The file might be partially decompressed "
+ "(errno=%d). Please delete the file.\n",
+ __func__, ret);
+out:
+ inode_unlock(inode);
+ file_end_write(filp);
+
+ return ret;
+}
+
+static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t page_idx = 0, last_idx;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ int cluster_size = F2FS_I(inode)->i_cluster_size;
+ int count, ret;
+
+ if (!f2fs_sb_has_compression(sbi) ||
+ F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER)
+ return -EOPNOTSUPP;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
+ file_start_write(filp);
+ inode_lock(inode);
+
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (f2fs_is_mmap_file(inode)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
+ if (ret)
+ goto out;
+
+ set_inode_flag(inode, FI_ENABLE_COMPRESS);
+
+ last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+
+ count = last_idx - page_idx;
+ while (count) {
+ int len = min(cluster_size, count);
+
+ ret = redirty_blocks(inode, page_idx, len);
+ if (ret < 0)
+ break;
+
+ if (get_dirty_pages(inode) >= blk_per_seg)
+ filemap_fdatawrite(inode->i_mapping);
+
+ count -= len;
+ page_idx += len;
+ }
+
+ if (!ret)
+ ret = filemap_write_and_wait_range(inode->i_mapping, 0,
+ LLONG_MAX);
+
+ clear_inode_flag(inode, FI_ENABLE_COMPRESS);
+
+ if (ret)
+ f2fs_warn(sbi, "%s: The file might be partially compressed "
+ "(errno=%d). Please delete the file.\n",
+ __func__, ret);
+out:
+ inode_unlock(inode);
+ file_end_write(filp);
+
+ return ret;
+}
+
+static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
switch (cmd) {
case FS_IOC_GETFLAGS:
return f2fs_ioc_getflags(filp, arg);
@@ -4027,11 +4284,29 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_reserve_compress_blocks(filp, arg);
case F2FS_IOC_SEC_TRIM_FILE:
return f2fs_sec_trim_file(filp, arg);
+ case F2FS_IOC_GET_COMPRESS_OPTION:
+ return f2fs_ioc_get_compress_option(filp, arg);
+ case F2FS_IOC_SET_COMPRESS_OPTION:
+ return f2fs_ioc_set_compress_option(filp, arg);
+ case F2FS_IOC_DECOMPRESS_FILE:
+ return f2fs_ioc_decompress_file(filp, arg);
+ case F2FS_IOC_COMPRESS_FILE:
+ return f2fs_ioc_compress_file(filp, arg);
default:
return -ENOTTY;
}
}
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
+ return -EIO;
+ if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp))))
+ return -ENOSPC;
+
+ return __f2fs_ioctl(filp, cmd, arg);
+}
+
static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
@@ -4148,8 +4423,63 @@ out:
}
#ifdef CONFIG_COMPAT
+struct compat_f2fs_gc_range {
+ u32 sync;
+ compat_u64 start;
+ compat_u64 len;
+};
+#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\
+ struct compat_f2fs_gc_range)
+
+static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg)
+{
+ struct compat_f2fs_gc_range __user *urange;
+ struct f2fs_gc_range range;
+ int err;
+
+ urange = compat_ptr(arg);
+ err = get_user(range.sync, &urange->sync);
+ err |= get_user(range.start, &urange->start);
+ err |= get_user(range.len, &urange->len);
+ if (err)
+ return -EFAULT;
+
+ return __f2fs_ioc_gc_range(file, &range);
+}
+
+struct compat_f2fs_move_range {
+ u32 dst_fd;
+ compat_u64 pos_in;
+ compat_u64 pos_out;
+ compat_u64 len;
+};
+#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
+ struct compat_f2fs_move_range)
+
+static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg)
+{
+ struct compat_f2fs_move_range __user *urange;
+ struct f2fs_move_range range;
+ int err;
+
+ urange = compat_ptr(arg);
+ err = get_user(range.dst_fd, &urange->dst_fd);
+ err |= get_user(range.pos_in, &urange->pos_in);
+ err |= get_user(range.pos_out, &urange->pos_out);
+ err |= get_user(range.len, &urange->len);
+ if (err)
+ return -EFAULT;
+
+ return __f2fs_ioc_move_range(file, &range);
+}
+
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)))))
+ return -EIO;
+ if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file))))
+ return -ENOSPC;
+
switch (cmd) {
case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
@@ -4160,6 +4490,10 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
+ case F2FS_IOC32_GARBAGE_COLLECT_RANGE:
+ return f2fs_compat_ioc_gc_range(file, arg);
+ case F2FS_IOC32_MOVE_RANGE:
+ return f2fs_compat_ioc_move_range(file, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
case F2FS_IOC_START_VOLATILE_WRITE:
@@ -4177,10 +4511,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case FS_IOC_GET_ENCRYPTION_NONCE:
case F2FS_IOC_GARBAGE_COLLECT:
- case F2FS_IOC_GARBAGE_COLLECT_RANGE:
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
- case F2FS_IOC_MOVE_RANGE:
case F2FS_IOC_FLUSH_DEVICE:
case F2FS_IOC_GET_FEATURES:
case FS_IOC_FSGETXATTR:
@@ -4197,11 +4529,15 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
case F2FS_IOC_SEC_TRIM_FILE:
+ case F2FS_IOC_GET_COMPRESS_OPTION:
+ case F2FS_IOC_SET_COMPRESS_OPTION:
+ case F2FS_IOC_DECOMPRESS_FILE:
+ case F2FS_IOC_COMPRESS_FILE:
break;
default:
return -ENOIOCTLCMD;
}
- return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+ return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 05641a1e36cc..3ef84e6ded41 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1986,7 +1986,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
freeze_super(sbi->sb);
down_write(&sbi->gc_lock);
- mutex_lock(&sbi->cp_mutex);
+ down_write(&sbi->cp_global_sem);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2031,7 +2031,7 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- mutex_unlock(&sbi->cp_mutex);
+ up_write(&sbi->cp_global_sem);
up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index de841aaf3c43..e3beac546c63 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -111,7 +111,9 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
* If the casefolded name is provided, hash it instead of the
* on-disk name. If the casefolded name is *not* provided, that
* should only be because the name wasn't valid Unicode, so fall
- * back to treating the name as an opaque byte sequence.
+ * back to treating the name as an opaque byte sequence. Note
+ * that to handle encrypted directories, the fallback must use
+ * usr_fname (plaintext) rather than disk_name (ciphertext).
*/
WARN_ON_ONCE(!fname->usr_fname->name);
if (fname->cf_name.name) {
@@ -121,6 +123,13 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
name = fname->usr_fname->name;
len = fname->usr_fname->len;
}
+ if (IS_ENCRYPTED(dir)) {
+ struct qstr tmp = QSTR_INIT(name, len);
+
+ fname->hash =
+ cpu_to_le32(fscrypt_fname_siphash(dir, &tmp));
+ return;
+ }
}
#endif
fname->hash = cpu_to_le32(TEA_hash_name(name, len));
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 70384e31788d..806ebabf5870 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -188,7 +188,8 @@ int f2fs_convert_inline_inode(struct inode *inode)
struct page *ipage, *page;
int err = 0;
- if (!f2fs_has_inline_data(inode))
+ if (!f2fs_has_inline_data(inode) ||
+ f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
return 0;
page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
@@ -266,7 +267,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
* [prev.] [next] of inline_data flag
* o o -> recover inline_data
* o x -> remove inline_data, and then recover data blocks
- * x o -> remove inline_data, and then recover inline_data
+ * x o -> remove data blocks, and then recover inline_data
* x x -> recover data blocks
*/
if (IS_INODE(npage))
@@ -298,6 +299,7 @@ process_inline:
if (IS_ERR(ipage))
return PTR_ERR(ipage);
f2fs_truncate_inline_inode(inode, ipage, 0);
+ stat_dec_inline_inode(inode);
clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
@@ -306,6 +308,7 @@ process_inline:
ret = f2fs_truncate_blocks(inode, 0, false);
if (ret)
return ret;
+ stat_inc_inline_inode(inode);
goto process_inline;
}
return 0;
@@ -332,6 +335,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
make_dentry_ptr_inline(dir, &d, inline_dentry);
de = f2fs_find_target_dentry(&d, fname, NULL);
unlock_page(ipage);
+ if (IS_ERR(de)) {
+ *res_page = ERR_CAST(de);
+ de = NULL;
+ }
if (de)
*res_page = ipage;
else
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 657db2fb6739..349d9cb933ee 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -456,6 +456,7 @@ static int do_read_inode(struct inode *inode)
le64_to_cpu(ri->i_compr_blocks));
fi->i_compress_algorithm = ri->i_compress_algorithm;
fi->i_log_cluster_size = ri->i_log_cluster_size;
+ fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag);
fi->i_cluster_size = 1 << fi->i_log_cluster_size;
set_inode_flag(inode, FI_COMPRESSED_FILE);
}
@@ -634,6 +635,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
&F2FS_I(inode)->i_compr_blocks));
ri->i_compress_algorithm =
F2FS_I(inode)->i_compress_algorithm;
+ ri->i_compress_flag =
+ cpu_to_le16(F2FS_I(inode)->i_compress_flag);
ri->i_log_cluster_size =
F2FS_I(inode)->i_log_cluster_size;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 8fa37d1434de..6edb1ab579a1 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -497,6 +497,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
err = f2fs_prepare_lookup(dir, dentry, &fname);
+ generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
goto out_splice;
if (err)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d5d8ce077f29..3a24423ac65f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct free_nid)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
- mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
- PAGE_SHIFT;
+ mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
+ sizeof(struct nat_entry)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
if (excess_cached_nats(sbi))
res = false;
@@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page)
static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
- return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid));
+ return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
}
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
@@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
list_add_tail(&ne->list, &nm_i->nat_entries);
spin_unlock(&nm_i->nat_list_lock);
- nm_i->nat_cnt++;
+ nm_i->nat_cnt[TOTAL_NAT]++;
+ nm_i->nat_cnt[RECLAIMABLE_NAT]++;
return ne;
}
@@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
{
radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
- nm_i->nat_cnt--;
+ nm_i->nat_cnt[TOTAL_NAT]--;
+ nm_i->nat_cnt[RECLAIMABLE_NAT]--;
__free_nat_entry(e);
}
@@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
if (get_nat_flag(ne, IS_DIRTY))
goto refresh_list;
- nm_i->dirty_nat_cnt++;
+ nm_i->nat_cnt[DIRTY_NAT]++;
+ nm_i->nat_cnt[RECLAIMABLE_NAT]--;
set_nat_flag(ne, IS_DIRTY, true);
refresh_list:
spin_lock(&nm_i->nat_list_lock);
@@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
set_nat_flag(ne, IS_DIRTY, false);
set->entry_cnt--;
- nm_i->dirty_nat_cnt--;
+ nm_i->nat_cnt[DIRTY_NAT]--;
+ nm_i->nat_cnt[RECLAIMABLE_NAT]++;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -2590,9 +2594,15 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
ri = F2FS_INODE(page);
if (ri->i_inline & F2FS_INLINE_XATTR) {
- set_inode_flag(inode, FI_INLINE_XATTR);
+ if (!f2fs_has_inline_xattr(inode)) {
+ set_inode_flag(inode, FI_INLINE_XATTR);
+ stat_inc_inline_xattr(inode);
+ }
} else {
- clear_inode_flag(inode, FI_INLINE_XATTR);
+ if (f2fs_has_inline_xattr(inode)) {
+ stat_dec_inline_xattr(inode);
+ clear_inode_flag(inode, FI_INLINE_XATTR);
+ }
goto update_inode;
}
@@ -2944,14 +2954,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
LIST_HEAD(sets);
int err = 0;
- /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
+ /*
+ * during unmount, let's flush nat_bits before checking
+ * nat_cnt[DIRTY_NAT].
+ */
if (enabled_nat_bits(sbi, cpc)) {
down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
up_write(&nm_i->nat_tree_lock);
}
- if (!nm_i->dirty_nat_cnt)
+ if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
down_write(&nm_i->nat_tree_lock);
@@ -2962,7 +2975,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* into nat entry set.
*/
if (enabled_nat_bits(sbi, cpc) ||
- !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ !__has_cursum_space(journal,
+ nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i,
@@ -3086,7 +3100,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
F2FS_RESERVED_NODE_NUM;
nm_i->nid_cnt[FREE_NID] = 0;
nm_i->nid_cnt[PREALLOC_NID] = 0;
- nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
@@ -3220,7 +3233,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
__del_from_nat_cache(nm_i, natvec[idx]);
}
}
- f2fs_bug_on(sbi, nm_i->nat_cnt);
+ f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
/* destroy nat set cache */
nid = 0;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 69e5859e993c..f84541b57acb 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
{
- return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
+ return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid *
NM_I(sbi)->dirty_nats_ratio / 100;
}
static inline bool excess_cached_nats(struct f2fs_sb_info *sbi)
{
- return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD;
+ return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD;
}
static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 4f12ade6410a..da75d5d52f0a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -5,6 +5,7 @@
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
*/
+#include <asm/unaligned.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include "f2fs.h"
@@ -128,7 +129,16 @@ static int init_recovered_filename(const struct inode *dir,
}
/* Compute the hash of the filename */
- if (IS_CASEFOLDED(dir)) {
+ if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) {
+ /*
+ * In this case the hash isn't computable without the key, so it
+ * was saved on-disk.
+ */
+ if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN)
+ return -EINVAL;
+ fname->hash = get_unaligned((f2fs_hash_t *)
+ &raw_inode->i_name[fname->disk_name.len]);
+ } else if (IS_CASEFOLDED(dir)) {
err = f2fs_init_casefolded_name(dir, fname);
if (err)
return err;
@@ -789,7 +799,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
- mutex_lock(&sbi->cp_mutex);
+ down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@@ -840,7 +850,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- mutex_unlock(&sbi->cp_mutex);
+ up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1596502f7375..deca74cb17df 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -529,31 +529,38 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
else
f2fs_build_free_nids(sbi, false, false);
- if (!is_idle(sbi, REQ_TIME) &&
- (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi)))
+ if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
+ excess_prefree_segs(sbi))
+ goto do_sync;
+
+ /* there is background inflight IO or foreground operation recently */
+ if (is_inflight_io(sbi, REQ_TIME) ||
+ (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
return;
+ /* exceed periodical checkpoint timeout threshold */
+ if (f2fs_time_over(sbi, CP_TIME))
+ goto do_sync;
+
/* checkpoint is the only way to shrink partial cached entries */
- if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
- !f2fs_available_free_memory(sbi, INO_ENTRIES) ||
- excess_prefree_segs(sbi) ||
- excess_dirty_nats(sbi) ||
- excess_dirty_nodes(sbi) ||
- f2fs_time_over(sbi, CP_TIME)) {
- if (test_opt(sbi, DATA_FLUSH) && from_bg) {
- struct blk_plug plug;
-
- mutex_lock(&sbi->flush_lock);
-
- blk_start_plug(&plug);
- f2fs_sync_dirty_inodes(sbi, FILE_INODE);
- blk_finish_plug(&plug);
+ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
+ f2fs_available_free_memory(sbi, INO_ENTRIES))
+ return;
- mutex_unlock(&sbi->flush_lock);
- }
- f2fs_sync_fs(sbi->sb, true);
- stat_inc_bg_cp_count(sbi->stat_info);
+do_sync:
+ if (test_opt(sbi, DATA_FLUSH) && from_bg) {
+ struct blk_plug plug;
+
+ mutex_lock(&sbi->flush_lock);
+
+ blk_start_plug(&plug);
+ f2fs_sync_dirty_inodes(sbi, FILE_INODE);
+ blk_finish_plug(&plug);
+
+ mutex_unlock(&sbi->flush_lock);
}
+ f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
}
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
@@ -3254,7 +3261,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
else
return CURSEG_COLD_DATA;
}
- if (file_is_cold(inode) || f2fs_compressed_file(inode))
+ if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
@@ -4544,7 +4551,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
return;
mutex_lock(&dirty_i->seglist_lock);
- for (segno = 0; segno < MAIN_SECS(sbi); segno += blks_per_sec) {
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
valid_blocks = get_valid_blocks(sbi, segno, true);
secno = GET_SEC_FROM_SEG(sbi, segno);
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index d66de5999a26..dd3c3c7a90ec 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -18,9 +18,7 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
{
- long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
-
- return count > 0 ? count : 0;
+ return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT];
}
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 00eff2f51807..b4a07fe62d1a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -146,6 +146,8 @@ enum {
Opt_compress_algorithm,
Opt_compress_log_size,
Opt_compress_extension,
+ Opt_compress_chksum,
+ Opt_compress_mode,
Opt_atgc,
Opt_err,
};
@@ -214,6 +216,8 @@ static match_table_t f2fs_tokens = {
{Opt_compress_algorithm, "compress_algorithm=%s"},
{Opt_compress_log_size, "compress_log_size=%u"},
{Opt_compress_extension, "compress_extension=%s"},
+ {Opt_compress_chksum, "compress_chksum"},
+ {Opt_compress_mode, "compress_mode=%s"},
{Opt_atgc, "atgc"},
{Opt_err, NULL},
};
@@ -934,10 +938,29 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
F2FS_OPTION(sbi).compress_ext_cnt++;
kfree(name);
break;
+ case Opt_compress_chksum:
+ F2FS_OPTION(sbi).compress_chksum = true;
+ break;
+ case Opt_compress_mode:
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "fs")) {
+ F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
+ } else if (!strcmp(name, "user")) {
+ F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
#else
case Opt_compress_algorithm:
case Opt_compress_log_size:
case Opt_compress_extension:
+ case Opt_compress_chksum:
+ case Opt_compress_mode:
f2fs_info(sbi, "compression options not supported");
break;
#endif
@@ -1523,6 +1546,14 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
seq_printf(seq, ",compress_extension=%s",
F2FS_OPTION(sbi).extensions[i]);
}
+
+ if (F2FS_OPTION(sbi).compress_chksum)
+ seq_puts(seq, ",compress_chksum");
+
+ if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS)
+ seq_printf(seq, ",compress_mode=%s", "fs");
+ else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER)
+ seq_printf(seq, ",compress_mode=%s", "user");
}
static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
@@ -1672,6 +1703,7 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
F2FS_OPTION(sbi).compress_ext_cnt = 0;
+ F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
sbi->sb->s_flags &= ~SB_INLINECRYPT;
@@ -1904,7 +1936,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (*flags & SB_RDONLY ||
F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) {
- writeback_inodes_sb(sb, WB_REASON_SYNC);
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2744,7 +2775,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
block_t total_sections, blocks_per_seg;
struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
(bh->b_data + F2FS_SUPER_OFFSET);
- unsigned int blocksize;
size_t crc_offset = 0;
__u32 crc = 0;
@@ -2770,18 +2800,11 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
}
}
- /* Currently, support only 4KB page cache size */
- if (F2FS_BLKSIZE != PAGE_SIZE) {
- f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB",
- PAGE_SIZE);
- return -EFSCORRUPTED;
- }
-
/* Currently, support only 4KB block size */
- blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
- if (blocksize != F2FS_BLKSIZE) {
- f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB",
- blocksize);
+ if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) {
+ f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u",
+ le32_to_cpu(raw_super->log_blocksize),
+ F2FS_BLKSIZE_BITS);
return -EFSCORRUPTED;
}
@@ -3071,9 +3094,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->total_node_count =
(le32_to_cpu(raw_super->segment_count_nat) / 2)
* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
- sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
- sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
- sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+ F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
+ F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
+ F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
@@ -3151,7 +3174,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx,
static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
{
struct block_device *bdev = FDEV(devi).bdev;
- sector_t nr_sectors = bdev->bd_part->nr_sects;
+ sector_t nr_sectors = bdev_nr_sectors(bdev);
struct f2fs_report_zones_args rep_zone_arg;
int ret;
@@ -3399,12 +3422,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
struct unicode_map *encoding;
__u16 encoding_flags;
- if (f2fs_sb_has_encrypt(sbi)) {
- f2fs_err(sbi,
- "Can't mount with encoding and encryption");
- return -EINVAL;
- }
-
if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info,
&encoding_flags)) {
f2fs_err(sbi,
@@ -3427,7 +3444,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
sbi->sb->s_encoding = encoding;
sbi->sb->s_encoding_flags = encoding_flags;
- sbi->sb->s_d_op = &f2fs_dentry_ops;
}
#else
if (f2fs_sb_has_casefold(sbi)) {
@@ -3559,7 +3575,7 @@ try_onemore:
sbi->valid_super_block = valid_super_block;
init_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
- mutex_init(&sbi->cp_mutex);
+ init_rwsem(&sbi->cp_global_sem);
init_rwsem(&sbi->node_write);
init_rwsem(&sbi->node_change);
@@ -3700,10 +3716,7 @@ try_onemore:
}
/* For write statistics */
- if (sb->s_bdev->bd_part)
- sbi->sectors_written_start =
- (u64)part_stat_read(sb->s_bdev->bd_part,
- sectors[STAT_WRITE]);
+ sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
/* Read accumulated write IO statistics if exists */
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
@@ -3918,6 +3931,7 @@ free_bio_info:
#ifdef CONFIG_UNICODE
utf8_unload(sb->s_encoding);
+ sb->s_encoding = NULL;
#endif
free_options:
#ifdef CONFIG_QUOTA
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index ec77ccfea923..30bae57428d1 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -90,25 +90,17 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- struct super_block *sb = sbi->sb;
-
- if (!sb->s_bdev->bd_part)
- return sprintf(buf, "0\n");
-
return sprintf(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
- BD_PART_WRITTEN(sbi)));
+ ((f2fs_get_sectors_written(sbi) -
+ sbi->sectors_written_start) >> 1)));
}
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- struct super_block *sb = sbi->sb;
int len = 0;
- if (!sb->s_bdev->bd_part)
- return sprintf(buf, "0\n");
-
if (f2fs_sb_has_encrypt(sbi))
len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
@@ -566,6 +558,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info,
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold);
F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
#ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -650,6 +643,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
ATTR_LIST(readdir_ra),
+ ATTR_LIST(max_io_bytes),
ATTR_LIST(gc_pin_file_thresh),
ATTR_LIST(extension_list),
#ifdef CONFIG_F2FS_FAULT_INJECTION
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 19ac5baad50f..05b36b28f2e8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -781,9 +781,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
{
struct task_struct *p;
enum pid_type type;
+ unsigned long flags;
struct pid *pid;
- read_lock(&fown->lock);
+ read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
pid = fown->pid;
@@ -804,7 +805,7 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
read_unlock(&tasklist_lock);
}
out_unlock_fown:
- read_unlock(&fown->lock);
+ read_unlock_irqrestore(&fown->lock, flags);
}
static void send_sigurg_to_task(struct task_struct *p,
@@ -819,9 +820,10 @@ int send_sigurg(struct fown_struct *fown)
struct task_struct *p;
enum pid_type type;
struct pid *pid;
+ unsigned long flags;
int ret = 0;
- read_lock(&fown->lock);
+ read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
pid = fown->pid;
@@ -844,7 +846,7 @@ int send_sigurg(struct fown_struct *fown)
read_unlock(&tasklist_lock);
}
out_unlock_fown:
- read_unlock(&fown->lock);
+ read_unlock_irqrestore(&fown->lock, flags);
return ret;
}
diff --git a/fs/file.c b/fs/file.c
index 4559b5fec3bd..8434e0afecc7 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
- /* make sure all __fd_install() have seen resize_in_progress
+ /* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
*/
if (atomic_read(&files->count) > 1)
@@ -181,7 +181,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
rcu_assign_pointer(files->fdt, new_fdt);
if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- /* coupled with smp_rmb() in __fd_install() */
+ /* coupled with smp_rmb() in fd_install() */
smp_wmb();
return 1;
}
@@ -411,19 +411,6 @@ static struct fdtable *close_files(struct files_struct * files)
return fdt;
}
-struct files_struct *get_files_struct(struct task_struct *task)
-{
- struct files_struct *files;
-
- task_lock(task);
- files = task->files;
- if (files)
- atomic_inc(&files->count);
- task_unlock(task);
-
- return files;
-}
-
void put_files_struct(struct files_struct *files)
{
if (atomic_dec_and_test(&files->count)) {
@@ -436,18 +423,6 @@ void put_files_struct(struct files_struct *files)
}
}
-void reset_files_struct(struct files_struct *files)
-{
- struct task_struct *tsk = current;
- struct files_struct *old;
-
- old = tsk->files;
- task_lock(tsk);
- tsk->files = files;
- task_unlock(tsk);
- put_files_struct(old);
-}
-
void exit_files(struct task_struct *tsk)
{
struct files_struct * files = tsk->files;
@@ -492,9 +467,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
/*
* allocate a file descriptor, mark it busy.
*/
-int __alloc_fd(struct files_struct *files,
- unsigned start, unsigned end, unsigned flags)
+static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
+ struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
@@ -550,14 +525,9 @@ out:
return error;
}
-static int alloc_fd(unsigned start, unsigned flags)
-{
- return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
-}
-
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
- return __alloc_fd(current->files, 0, nofile, flags);
+ return alloc_fd(0, nofile, flags);
}
int get_unused_fd_flags(unsigned flags)
@@ -596,17 +566,13 @@ EXPORT_SYMBOL(put_unused_fd);
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
- * NOTE: __fd_install() variant is really, really low-level; don't
- * use it unless you are forced to by truly lousy API shoved down
- * your throat. 'files' *MUST* be either current->files or obtained
- * by get_files_struct(current) done by whoever had given it to you,
- * or really bad things will happen. Normally you want to use
- * fd_install() instead.
+ * This consumes the "file" refcount, so callers should treat it
+ * as if they had called fput(file).
*/
-void __fd_install(struct files_struct *files, unsigned int fd,
- struct file *file)
+void fd_install(unsigned int fd, struct file *file)
{
+ struct files_struct *files = current->files;
struct fdtable *fdt;
rcu_read_lock_sched();
@@ -628,15 +594,6 @@ void __fd_install(struct files_struct *files, unsigned int fd,
rcu_read_unlock_sched();
}
-/*
- * This consumes the "file" refcount, so callers should treat it
- * as if they had called fput(file).
- */
-void fd_install(unsigned int fd, struct file *file)
-{
- __fd_install(current->files, fd, file);
-}
-
EXPORT_SYMBOL(fd_install);
static struct file *pick_file(struct files_struct *files, unsigned fd)
@@ -659,11 +616,9 @@ out_unlock:
return file;
}
-/*
- * The same warnings as for __alloc_fd()/__fd_install() apply here...
- */
-int __close_fd(struct files_struct *files, unsigned fd)
+int close_fd(unsigned fd)
{
+ struct files_struct *files = current->files;
struct file *file;
file = pick_file(files, fd);
@@ -672,7 +627,36 @@ int __close_fd(struct files_struct *files, unsigned fd)
return filp_close(file, files);
}
-EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+
+static inline void __range_cloexec(struct files_struct *cur_fds,
+ unsigned int fd, unsigned int max_fd)
+{
+ struct fdtable *fdt;
+
+ if (fd > max_fd)
+ return;
+
+ spin_lock(&cur_fds->file_lock);
+ fdt = files_fdtable(cur_fds);
+ bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
+ spin_unlock(&cur_fds->file_lock);
+}
+
+static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+ unsigned int max_fd)
+{
+ while (fd <= max_fd) {
+ struct file *file;
+
+ file = pick_file(cur_fds, fd++);
+ if (!file)
+ continue;
+
+ filp_close(file, cur_fds);
+ cond_resched();
+ }
+}
/**
* __close_range() - Close all file descriptors in a given range.
@@ -689,7 +673,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
- if (flags & ~CLOSE_RANGE_UNSHARE)
+ if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
return -EINVAL;
if (fd > max_fd)
@@ -727,16 +711,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
max_fd = min(max_fd, cur_max);
- while (fd <= max_fd) {
- struct file *file;
- file = pick_file(cur_fds, fd++);
- if (!file)
- continue;
-
- filp_close(file, cur_fds);
- cond_resched();
- }
+ if (flags & CLOSE_RANGE_CLOEXEC)
+ __range_cloexec(cur_fds, fd, max_fd);
+ else
+ __range_close(cur_fds, fd, max_fd);
if (fds) {
/*
@@ -753,11 +732,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
- * variant of __close_fd that gets a ref on the file for later fput.
+ * variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
-int __close_fd_get_file(unsigned int fd, struct file **res)
+int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
struct file *file;
@@ -826,7 +805,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
rcu_read_lock();
loop:
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_rcu(files, fd);
if (file) {
/* File object ref couldn't be taken.
* dup2() atomicity guarantee is the reason
@@ -877,6 +856,42 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
+struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files)
+ file = files_lookup_fd_rcu(files, fd);
+ task_unlock(task);
+
+ return file;
+}
+
+struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+{
+ /* Must be called with rcu_read_lock held */
+ struct files_struct *files;
+ unsigned int fd = *ret_fd;
+ struct file *file = NULL;
+
+ task_lock(task);
+ files = task->files;
+ if (files) {
+ for (; fd < files_fdtable(files)->max_fds; fd++) {
+ file = files_lookup_fd_rcu(files, fd);
+ if (file)
+ break;
+ }
+ }
+ task_unlock(task);
+ *ret_fd = fd;
+ return file;
+}
+
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*
@@ -899,7 +914,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
struct file *file;
if (atomic_read(&files->count) == 1) {
- file = __fcheck_files(files, fd);
+ file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
@@ -1021,7 +1036,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
struct files_struct *files = current->files;
if (!file)
- return __close_fd(files, fd);
+ return close_fd(fd);
if (fd >= rlimit(RLIMIT_NOFILE))
return -EBADF;
@@ -1110,7 +1125,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
- file = fcheck(oldfd);
+ file = files_lookup_fd_locked(files, oldfd);
if (unlikely(!file))
goto Ebadf;
if (unlikely(err < 0)) {
@@ -1139,7 +1154,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
int retval = oldfd;
rcu_read_lock();
- if (!fcheck_files(files, oldfd))
+ if (!files_lookup_fd_rcu(files, oldfd))
retval = -EBADF;
rcu_read_unlock();
return retval;
@@ -1164,10 +1179,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
+ unsigned long nofile = rlimit(RLIMIT_NOFILE);
int err;
- if (from >= rlimit(RLIMIT_NOFILE))
+ if (from >= nofile)
return -EINVAL;
- err = alloc_fd(from, flags);
+ err = alloc_fd(from, nofile, flags);
if (err >= 0) {
get_file(file);
fd_install(err, file);
diff --git a/fs/file_table.c b/fs/file_table.c
index 709ada3151da..45437f8e1003 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
mutex_init(&f->f_pos_lock);
- eventpoll_init_file(f);
f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e6005c78bfa9..acfb55834af2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2321,10 +2321,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wb = locked_inode_to_wb_and_lock_list(inode);
- WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) &&
- !test_bit(WB_registered, &wb->state),
- "bdi-%s not registered\n", bdi_dev_name(wb->bdi));
-
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 5a48cee6d7d3..f529075a2ce8 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
void *value = NULL;
struct posix_acl *acl;
+ if (fuse_is_bad(inode))
+ return ERR_PTR(-EIO);
+
if (!fc->posix_acl || fc->no_getxattr)
return NULL;
@@ -53,6 +56,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
const char *name;
int ret;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fc->posix_acl || fc->no_setxattr)
return -EOPNOTSUPP;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ff7dbeb16f88..78f9f209078c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -202,10 +202,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
int ret;
inode = d_inode_rcu(entry);
- if (inode && is_bad_inode(inode))
+ if (inode && fuse_is_bad(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
- (flags & LOOKUP_REVAL)) {
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) {
struct fuse_entry_out outarg;
FUSE_ARGS(args);
struct fuse_forget_link *forget;
@@ -328,12 +328,11 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
if (!fm)
goto out_put_fsc;
- refcount_set(&fm->count, 1);
fsc->s_fs_info = fm;
sb = sget_fc(fsc, NULL, set_anon_super_fc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
- fuse_mount_put(fm);
+ kfree(fm);
goto out_put_fsc;
}
fm->fc = fuse_conn_get(fc);
@@ -463,6 +462,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
bool outarg_valid = true;
bool locked;
+ if (fuse_is_bad(dir))
+ return ERR_PTR(-EIO);
+
locked = fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
@@ -542,6 +544,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.flags = flags;
inarg.mode = mode;
inarg.umask = current_umask();
+
+ if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
+ !(flags & O_EXCL) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
args.opcode = FUSE_CREATE;
args.nodeid = get_node_id(dir);
args.in_numargs = 2;
@@ -606,6 +614,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
struct fuse_conn *fc = get_fuse_conn(dir);
struct dentry *res = NULL;
+ if (fuse_is_bad(dir))
+ return -EIO;
+
if (d_in_lookup(entry)) {
res = fuse_lookup(dir, entry, 0);
if (IS_ERR(res))
@@ -654,6 +665,9 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
int err;
struct fuse_forget_link *forget;
+ if (fuse_is_bad(dir))
+ return -EIO;
+
forget = fuse_alloc_forget();
if (!forget)
return -ENOMEM;
@@ -781,6 +795,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
+ if (fuse_is_bad(dir))
+ return -EIO;
+
args.opcode = FUSE_UNLINK;
args.nodeid = get_node_id(dir);
args.in_numargs = 1;
@@ -817,6 +834,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
+ if (fuse_is_bad(dir))
+ return -EIO;
+
args.opcode = FUSE_RMDIR;
args.nodeid = get_node_id(dir);
args.in_numargs = 1;
@@ -895,6 +915,9 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
struct fuse_conn *fc = get_fuse_conn(olddir);
int err;
+ if (fuse_is_bad(olddir))
+ return -EIO;
+
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
@@ -1030,7 +1053,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
(inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
- make_bad_inode(inode);
+ fuse_make_bad(inode);
err = -EIO;
} else {
fuse_change_attributes(inode, &outarg.attr,
@@ -1232,6 +1255,9 @@ static int fuse_permission(struct inode *inode, int mask)
bool refreshed = false;
int err = 0;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(fc))
return -EACCES;
@@ -1327,7 +1353,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out_err;
if (fc->cache_symlinks)
@@ -1375,7 +1401,7 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
if (fc->no_fsyncdir)
@@ -1649,10 +1675,20 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
inarg.valid |= FATTR_FH;
inarg.fh = ff->fh;
}
+
+ /* Kill suid/sgid for non-directory chown unconditionally */
+ if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) &&
+ attr->ia_valid & (ATTR_UID | ATTR_GID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
+
if (attr->ia_valid & ATTR_SIZE) {
/* For mandatory locking in truncate */
inarg.valid |= FATTR_LOCKOWNER;
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
+
+ /* Kill suid/sgid for truncate only if no CAP_FSETID */
+ if (fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ inarg.valid |= FATTR_KILL_SUIDGID;
}
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
err = fuse_simple_request(fm, &args);
@@ -1664,7 +1700,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (fuse_invalid_attr(&outarg.attr) ||
(inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
- make_bad_inode(inode);
+ fuse_make_bad(inode);
err = -EIO;
goto error;
}
@@ -1727,6 +1763,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL;
int ret;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(get_fuse_conn(inode)))
return -EACCES;
@@ -1740,7 +1779,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
*
* This should be done on write(), truncate() and chown().
*/
- if (!fc->handle_killpriv) {
+ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) {
/*
* ia_mode calculation may have used stale i_mode.
* Refresh and recalculate.
@@ -1785,6 +1824,9 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
struct inode *inode = d_inode(path->dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(fc)) {
if (!request_mask) {
/*
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c03034e8c152..8cccecb55fb8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -42,6 +42,12 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
+
+ if (fm->fc->handle_killpriv_v2 &&
+ (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
+ inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
+ }
+
args.opcode = opcode;
args.nodeid = nodeid;
args.in_numargs = 1;
@@ -226,6 +232,9 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
bool dax_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+ if (fuse_is_bad(inode))
+ return -EIO;
+
err = generic_file_open(inode, file);
if (err)
return err;
@@ -463,7 +472,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
FUSE_ARGS(args);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
err = write_inode_now(inode, 1);
@@ -535,7 +544,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end,
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
inode_lock(inode);
@@ -859,7 +868,7 @@ static int fuse_readpage(struct file *file, struct page *page)
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out;
err = fuse_do_readpage(file, page);
@@ -952,7 +961,7 @@ static void fuse_readahead(struct readahead_control *rac)
struct fuse_conn *fc = get_fuse_conn(inode);
unsigned int i, max_pages, nr_pages = 0;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return;
max_pages = min_t(unsigned int, fc->max_pages,
@@ -1097,6 +1106,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
+ if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
@@ -1260,17 +1271,24 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
+ struct fuse_conn *fc = get_fuse_conn(inode);
loff_t endbyte = 0;
- if (get_fuse_conn(inode)->writeback_cache) {
+ if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
err = fuse_update_attributes(mapping->host, file);
if (err)
return err;
+ if (fc->handle_killpriv_v2 &&
+ should_remove_suid(file_dentry(file))) {
+ goto writethrough;
+ }
+
return generic_file_write_iter(iocb, from);
}
+writethrough:
inode_lock(inode);
/* We can write back this queue in page reclaim */
@@ -1451,7 +1469,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (write) {
if (!capable(CAP_FSETID))
- ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;
+ ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
nres = fuse_send_write(ia, pos, nbytes, owner);
} else {
@@ -1555,7 +1573,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
if (FUSE_IS_DAX(inode))
@@ -1573,7 +1591,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
if (FUSE_IS_DAX(inode))
@@ -2172,7 +2190,7 @@ static int fuse_writepages(struct address_space *mapping,
int err;
err = -EIO;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
goto out;
data.inode = inode;
@@ -2281,6 +2299,9 @@ static int fuse_launder_page(struct page *page)
int err = 0;
if (clear_page_dirty_for_io(page)) {
struct inode *inode = page->mapping->host;
+
+ /* Serialize with pending writeback for the same page */
+ fuse_wait_on_page_writeback(inode, page->index);
err = fuse_writepage_locked(page);
if (!err)
fuse_wait_on_page_writeback(inode, page->index);
@@ -2954,7 +2975,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
if (!fuse_allow_current_process(fc))
return -EACCES;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
return fuse_do_ioctl(file, cmd, arg, flags);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d51598017d13..7c4b8cb93f9f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -172,6 +172,8 @@ enum {
FUSE_I_INIT_RDPLUS,
/** An operation changing file size is in progress */
FUSE_I_SIZE_UNSTABLE,
+ /* Bad inode */
+ FUSE_I_BAD,
};
struct fuse_conn;
@@ -636,6 +638,14 @@ struct fuse_conn {
unsigned int legacy_opts_show:1;
/*
+ * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on
+ * write/trunc only if caller did not have CAP_FSETID. sgid is killed
+ * on write/truncate only if caller did not have CAP_FSETID as well as
+ * file has group execute permission.
+ */
+ unsigned handle_killpriv_v2:1;
+
+ /*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
*/
@@ -801,9 +811,6 @@ struct fuse_mount {
/* Underlying (potentially shared) connection to the FUSE server */
struct fuse_conn *fc;
- /* Refcount */
- refcount_t count;
-
/*
* Super block for this connection (fc->killsb must be held when
* accessing this).
@@ -821,9 +828,7 @@ static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
{
- struct fuse_mount *fm = get_fuse_mount_super(sb);
-
- return fm ? fm->fc : NULL;
+ return get_fuse_mount_super(sb)->fc;
}
static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
@@ -833,9 +838,7 @@ static inline struct fuse_mount *get_fuse_mount(struct inode *inode)
static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
{
- struct fuse_mount *fm = get_fuse_mount(inode);
-
- return fm ? fm->fc : NULL;
+ return get_fuse_mount_super(inode->i_sb)->fc;
}
static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
@@ -858,6 +861,16 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
return atomic64_read(&fc->attr_version);
}
+static inline void fuse_make_bad(struct inode *inode)
+{
+ set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
+}
+
+static inline bool fuse_is_bad(struct inode *inode)
+{
+ return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -1024,16 +1037,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
*/
void fuse_conn_put(struct fuse_conn *fc);
-/**
- * Acquire reference to fuse_mount
- */
-struct fuse_mount *fuse_mount_get(struct fuse_mount *fm);
-
-/**
- * Release reference to fuse_mount
- */
-void fuse_mount_put(struct fuse_mount *fm);
-
struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
struct fuse_dev *fuse_dev_alloc(void);
void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a47afc95f80..b0e18b470e91 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -132,7 +132,7 @@ static void fuse_evict_inode(struct inode *inode)
fi->forget = NULL;
}
}
- if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) {
WARN_ON(!list_empty(&fi->write_files));
WARN_ON(!list_empty(&fi->queued_writes));
}
@@ -204,6 +204,16 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mode &= ~S_ISVTX;
fi->orig_ino = attr->ino;
+
+ /*
+ * We are refreshing inode data and it is possible that another
+ * client set suid/sgid or security.capability xattr. So clear
+ * S_NOSEC. Ideally, we could have cleared it only if suid/sgid
+ * was set or if security.capability xattr was set. But we don't
+ * know if security.capability has been set or not. So clear it
+ * anyway. Its less efficient but should be safe.
+ */
+ inode->i_flags &= ~S_NOSEC;
}
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
@@ -342,7 +352,7 @@ retry:
unlock_new_inode(inode);
} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
/* Inode has changed type, any I/O on the old should fail */
- make_bad_inode(inode);
+ fuse_make_bad(inode);
iput(inode);
goto retry;
}
@@ -452,7 +462,8 @@ static void fuse_put_super(struct super_block *sb)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
- fuse_mount_put(fm);
+ fuse_conn_put(fm->fc);
+ kfree(fm);
}
static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -705,7 +716,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
INIT_LIST_HEAD(&fc->mounts);
list_add(&fm->fc_entry, &fc->mounts);
fm->fc = fc;
- refcount_set(&fm->count, 1);
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -732,23 +742,6 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
-void fuse_mount_put(struct fuse_mount *fm)
-{
- if (refcount_dec_and_test(&fm->count)) {
- if (fm->fc)
- fuse_conn_put(fm->fc);
- kfree(fm);
- }
-}
-EXPORT_SYMBOL_GPL(fuse_mount_put);
-
-struct fuse_mount *fuse_mount_get(struct fuse_mount *fm)
-{
- refcount_inc(&fm->count);
- return fm;
-}
-EXPORT_SYMBOL_GPL(fuse_mount_get);
-
static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
@@ -1055,6 +1048,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
!fuse_dax_check_alignment(fc, arg->map_alignment)) {
ok = false;
}
+ if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
+ fc->handle_killpriv_v2 = 1;
+ fm->sb->s_flags |= SB_NOSEC;
+ }
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1097,7 +1094,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
- FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+ FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
+ FUSE_HANDLE_KILLPRIV_V2;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
ia->in.flags |= FUSE_MAP_ALIGNMENT;
@@ -1465,7 +1463,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
return 0;
err_put_conn:
- fuse_mount_put(fm);
+ fuse_conn_put(fc);
+ kfree(fm);
sb->s_fs_info = NULL;
err_fput:
fput(file);
@@ -1557,7 +1556,7 @@ void fuse_conn_destroy(struct fuse_mount *fm)
}
EXPORT_SYMBOL_GPL(fuse_conn_destroy);
-static void fuse_kill_sb_anon(struct super_block *sb)
+static void fuse_sb_destroy(struct super_block *sb)
{
struct fuse_mount *fm = get_fuse_mount_super(sb);
bool last;
@@ -1567,6 +1566,11 @@ static void fuse_kill_sb_anon(struct super_block *sb)
if (last)
fuse_conn_destroy(fm);
}
+}
+
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+ fuse_sb_destroy(sb);
kill_anon_super(sb);
}
@@ -1583,14 +1587,7 @@ MODULE_ALIAS_FS("fuse");
#ifdef CONFIG_BLOCK
static void fuse_kill_sb_blk(struct super_block *sb)
{
- struct fuse_mount *fm = get_fuse_mount_super(sb);
- bool last;
-
- if (fm) {
- last = fuse_mount_remove(fm);
- if (last)
- fuse_conn_destroy(fm);
- }
+ fuse_sb_destroy(sb);
kill_block_super(sb);
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 3b5e91045871..3441ffa740f3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -207,7 +207,7 @@ retry:
dput(dentry);
goto retry;
}
- if (is_bad_inode(inode)) {
+ if (fuse_is_bad(inode)) {
dput(dentry);
return -EIO;
}
@@ -568,7 +568,7 @@ int fuse_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
int err;
- if (is_bad_inode(inode))
+ if (fuse_is_bad(inode))
return -EIO;
mutex_lock(&ff->readdir.lock);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 21a9e534417c..8868ac31a3c0 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1402,18 +1402,6 @@ static int virtio_fs_test_super(struct super_block *sb,
return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv;
}
-static int virtio_fs_set_super(struct super_block *sb,
- struct fs_context *fsc)
-{
- int err;
-
- err = get_anon_bdev(&sb->s_dev);
- if (!err)
- fuse_mount_get(fsc->s_fs_info);
-
- return err;
-}
-
static int virtio_fs_get_tree(struct fs_context *fsc)
{
struct virtio_fs *fs;
@@ -1432,22 +1420,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
return -EINVAL;
}
+ err = -ENOMEM;
fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL);
- if (!fc) {
- mutex_lock(&virtio_fs_mutex);
- virtio_fs_put(fs);
- mutex_unlock(&virtio_fs_mutex);
- return -ENOMEM;
- }
+ if (!fc)
+ goto out_err;
fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL);
- if (!fm) {
- mutex_lock(&virtio_fs_mutex);
- virtio_fs_put(fs);
- mutex_unlock(&virtio_fs_mutex);
- kfree(fc);
- return -ENOMEM;
- }
+ if (!fm)
+ goto out_err;
fuse_conn_init(fc, fm, get_user_ns(current_user_ns()),
&virtio_fs_fiq_ops, fs);
@@ -1456,14 +1436,20 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
fc->auto_submounts = true;
fsc->s_fs_info = fm;
- sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super);
- fuse_mount_put(fm);
+ sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
+ if (fsc->s_fs_info) {
+ fuse_conn_put(fc);
+ kfree(fm);
+ }
if (IS_ERR(sb))
return PTR_ERR(sb);
if (!sb->s_root) {
err = virtio_fs_fill_super(sb, fsc);
if (err) {
+ fuse_conn_put(fc);
+ kfree(fm);
+ sb->s_fs_info = NULL;
deactivate_locked_super(sb);
return err;
}
@@ -1474,6 +1460,13 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
WARN_ON(fsc->root);
fsc->root = dget(sb->s_root);
return 0;
+
+out_err:
+ kfree(fc);
+ mutex_lock(&virtio_fs_mutex);
+ virtio_fs_put(fs);
+ mutex_unlock(&virtio_fs_mutex);
+ return err;
}
static const struct fs_context_operations virtio_fs_context_ops = {
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 371bdcbc7233..cdea18de94f7 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -113,6 +113,9 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
struct fuse_getxattr_out outarg;
ssize_t ret;
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!fuse_allow_current_process(fm->fc))
return -EACCES;
@@ -178,6 +181,9 @@ static int fuse_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
return fuse_getxattr(inode, name, value, size);
}
@@ -186,6 +192,9 @@ static int fuse_xattr_set(const struct xattr_handler *handler,
const char *name, const void *value, size_t size,
int flags)
{
+ if (fuse_is_bad(inode))
+ return -EIO;
+
if (!value)
return fuse_removexattr(inode, name);
diff --git a/fs/inode.c b/fs/inode.c
index 9d78c37b00b8..cb008acf0efd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -155,7 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_bytes = 0;
inode->i_generation = 0;
inode->i_pipe = NULL;
- inode->i_bdev = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
inode->i_dir_seq = 0;
@@ -580,8 +579,6 @@ static void evict(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
- if (S_ISBLK(inode->i_mode) && inode->i_bdev)
- bd_forget(inode);
if (S_ISCHR(inode->i_mode) && inode->i_cdev)
cd_forget(inode);
diff --git a/fs/internal.h b/fs/internal.h
index a7cd0f64faa4..77c50befbfbe 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,7 +25,6 @@ extern void __init bdev_cache_init(void);
extern int __sync_blockdev(struct block_device *bdev, int wait);
void iterate_bdevs(void (*)(struct block_device *, void *), void *);
void emergency_thaw_bdev(struct super_block *sb);
-void bd_forget(struct inode *inode);
#else
static inline void bdev_cache_init(void)
{
@@ -43,9 +42,6 @@ static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
}
-static inline void bd_forget(struct inode *inode)
-{
-}
#endif /* CONFIG_BLOCK */
/*
@@ -78,6 +74,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
long do_rmdir(int dfd, struct filename *name);
long do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct path *link);
+int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
+ struct filename *newname, unsigned int flags);
/*
* namespace.c
@@ -114,7 +112,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
*/
extern int reconfigure_super(struct fs_context *);
extern bool trylock_super(struct super_block *sb);
-extern struct super_block *user_get_super(dev_t);
+struct super_block *user_get_super(dev_t, bool excl);
+void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index b53c055bea6a..f72d53848dcb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
return IO_WQ_CANCEL_NOTFOUND;
}
-static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
-{
- return work == data;
-}
-
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
-{
- return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
-}
-
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index cba36f03c355..069496c6d4f9 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work)
}
void io_wq_cancel_all(struct io_wq *wq);
-enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 86dac2b2e276..6f9392c35eef 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -245,6 +245,8 @@ struct io_sq_data {
struct task_struct *thread;
struct wait_queue_head wait;
+
+ unsigned sq_thread_idle;
};
struct io_ring_ctx {
@@ -285,7 +287,6 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head cq_overflow_list;
- wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
@@ -310,7 +311,6 @@ struct io_ring_ctx {
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
- struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
@@ -395,16 +395,18 @@ struct io_ring_ctx {
*/
struct io_poll_iocb {
struct file *file;
- union {
- struct wait_queue_head *head;
- u64 addr;
- };
+ struct wait_queue_head *head;
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
+struct io_poll_remove {
+ struct file *file;
+ u64 addr;
+};
+
struct io_close {
struct file *file;
struct file *put_file;
@@ -444,11 +446,17 @@ struct io_timeout {
u32 off;
u32 target_seq;
struct list_head list;
+ /* head of the link, used by linked timeouts only */
+ struct io_kiocb *head;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
+
+ /* timeout update */
+ struct timespec64 ts;
+ u32 flags;
};
struct io_rw {
@@ -541,6 +549,27 @@ struct io_statx {
struct statx __user *buffer;
};
+struct io_shutdown {
+ struct file *file;
+ int how;
+};
+
+struct io_rename {
+ struct file *file;
+ int old_dfd;
+ int new_dfd;
+ struct filename *oldpath;
+ struct filename *newpath;
+ int flags;
+};
+
+struct io_unlink {
+ struct file *file;
+ int dfd;
+ int flags;
+ struct filename *filename;
+};
+
struct io_completion {
struct file *file;
struct list_head list;
@@ -575,7 +604,6 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
- REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
@@ -607,8 +635,6 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
- /* head of a link */
- REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
@@ -651,6 +677,7 @@ struct io_kiocb {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
+ struct io_poll_remove poll_remove;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
@@ -667,6 +694,9 @@ struct io_kiocb {
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
+ struct io_shutdown shutdown;
+ struct io_rename rename;
+ struct io_unlink unlink;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
@@ -686,15 +716,14 @@ struct io_kiocb {
struct task_struct *task;
u64 user_data;
- struct list_head link_list;
+ struct io_kiocb *link;
+ struct percpu_ref *fixed_file_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
-
- struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
@@ -725,6 +754,8 @@ struct io_submit_state {
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
+ bool plug_started;
+
/*
* Batch completion logic
*/
@@ -735,7 +766,7 @@ struct io_submit_state {
*/
struct file *file;
unsigned int fd;
- unsigned int has_refs;
+ unsigned int file_refs;
unsigned int ios_left;
};
@@ -757,6 +788,8 @@ struct io_op_def {
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
+ /* should block plug */
+ unsigned plug : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
@@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
@@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
@@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
- [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_TIMEOUT_REMOVE] = {
+ /* used by timeout updates' prep() */
+ .work_flags = IO_WQ_WORK_MM,
+ },
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FS,
+ IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
@@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
@@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
+ .plug = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
@@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
- IO_WQ_WORK_BLKCG,
+ IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
@@ -934,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
+ [IORING_OP_SHUTDOWN] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_RENAMEAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
+ [IORING_OP_UNLINKAT] = {
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+ IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+ },
};
enum io_mem_account {
@@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
}
EXPORT_SYMBOL(io_uring_get_socket);
+#define io_for_each_link(pos, head) \
+ for (pos = (head); pos; pos = pos->link)
+
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req)
__io_clean_op(req);
}
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->fixed_file_refs) {
+ req->fixed_file_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_file_refs);
+ }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+ struct task_struct *task,
+ struct files_struct *files)
{
+ struct io_kiocb *req;
+
+ if (task && head->task != task)
+ return false;
+ if (!files)
+ return true;
+
+ io_for_each_link(req, head) {
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_FILES) &&
+ req->work.identity->files == files)
+ return true;
+ }
+ return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
+{
+ struct files_struct *files = current->files;
struct mm_struct *mm = current->mm;
if (mm) {
@@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void)
mmput(mm);
current->mm = NULL;
}
+ if (files) {
+ struct nsproxy *nsproxy = current->nsproxy;
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
+ put_files_struct(files);
+ put_nsproxy(nsproxy);
+ }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{
+ if (!current->files) {
+ struct files_struct *files;
+ struct nsproxy *nsproxy;
+
+ task_lock(ctx->sqo_task);
+ files = ctx->sqo_task->files;
+ if (!files) {
+ task_unlock(ctx->sqo_task);
+ return -EOWNERDEAD;
+ }
+ atomic_inc(&files->count);
+ get_nsproxy(ctx->sqo_task->nsproxy);
+ nsproxy = ctx->sqo_task->nsproxy;
+ task_unlock(ctx->sqo_task);
+
+ task_lock(current);
+ current->files = files;
+ current->nsproxy = nsproxy;
+ task_unlock(current);
+ }
+ return 0;
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
@@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
- if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
- return 0;
- return __io_sq_thread_acquire_mm(ctx);
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+ int ret;
+
+ if (def->work_flags & IO_WQ_WORK_MM) {
+ ret = __io_sq_thread_acquire_mm(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+ ret = __io_sq_thread_acquire_files(ctx);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ return 0;
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
- io_prep_async_work(req);
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(cur, &req->link_list, link_list)
- io_prep_async_work(cur);
+ io_for_each_link(cur, req)
+ io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
}
}
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (!tsk || req->task == tsk)
- return true;
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (ctx->sq_data && req->task == ctx->sq_data->thread)
- return true;
- }
- return false;
-}
-
/*
* Returns true if we found and killed one or more timeouts
*/
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_task_match(req, tsk)) {
+ if (io_match_task(req, tsk, files)) {
io_kill_timeout(req);
canceled++;
}
@@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
}
}
-static inline bool __io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- return ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES)) &&
- req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
- struct files_struct *files)
-{
- struct io_kiocb *link;
-
- if (!files)
- return true;
- if (__io_match_files(req, files))
- return true;
- if (req->flags & REQ_F_LINK_HEAD) {
- list_for_each_entry(link, &req->link_list, link_list) {
- if (__io_match_files(link, files))
- return true;
- }
- }
- return false;
-}
-
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
@@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
- if (tsk && req->task != tsk)
- continue;
- if (!io_match_files(req, files))
+ if (!io_match_task(req, tsk, files))
continue;
cqe = io_get_cqring(ctx);
@@ -1845,9 +1935,7 @@ fallback:
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
- if (fixed)
- percpu_ref_put(req->fixed_file_refs);
- else
+ if (!fixed)
fput(file);
}
@@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+ if (req->fixed_file_refs)
+ percpu_ref_put(req->fixed_file_refs);
io_req_clean_work(req);
}
@@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req)
percpu_ref_put(&ctx->refs);
}
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = req->link;
+
+ req->link = nxt->link;
+ nxt->link = NULL;
+}
+
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -1890,8 +1987,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
+ link = req->link;
+
/*
* Can happen if a linked timeout fired and link had been like
* req -> link t-out -> link t-out [-> ...]
@@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
struct io_timeout_data *io = link->async_data;
int ret;
- list_del_init(&link->link_list);
+ io_remove_next_linked(req);
+ link->timeout.head = NULL;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
io_cqring_fill_event(link, -ECANCELED);
@@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
}
}
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- /*
- * The list should never be empty when we are called here. But could
- * potentially happen if the chain is messed up, check to be on the
- * safe side.
- */
- if (unlikely(list_empty(&req->link_list)))
- return NULL;
- nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- list_del_init(&req->link_list);
- if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK_HEAD;
- return nxt;
-}
-
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_kiocb *link, *nxt;
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- while (!list_empty(&req->link_list)) {
- struct io_kiocb *link = list_first_entry(&req->link_list,
- struct io_kiocb, link_list);
+ link = req->link;
+ req->link = NULL;
- list_del_init(&link->link_list);
- trace_io_uring_fail_link(req, link);
+ while (link) {
+ nxt = link->link;
+ link->link = NULL;
+ trace_io_uring_fail_link(req, link);
io_cqring_fill_event(link, -ECANCELED);
/*
@@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req)
io_put_req_deferred(link, 2);
else
io_double_put_req(link);
+ link = nxt;
}
-
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req)
static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
{
- req->flags &= ~REQ_F_LINK_HEAD;
if (req->flags & REQ_F_LINK_TIMEOUT)
io_kill_linked_timeout(req);
@@ -1983,20 +2061,24 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (likely(!(req->flags & REQ_F_FAIL_LINK)))
- return io_req_link_next(req);
+ if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+ struct io_kiocb *nxt = req->link;
+
+ req->link = NULL;
+ return nxt;
+ }
io_fail_links(req);
return NULL;
}
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+ if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
return NULL;
return __io_req_find_next(req);
}
-static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
+static int io_req_task_work_add(struct io_kiocb *req)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
@@ -2013,7 +2095,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
* will do the job.
*/
notify = TWA_NONE;
- if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
ret = task_work_add(tsk, &req->task_work, notify);
@@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!__io_sq_thread_acquire_mm(ctx)) {
+ if (!__io_sq_thread_acquire_mm(ctx) &&
+ !__io_sq_thread_acquire_files(ctx)) {
mutex_lock(&ctx->uring_lock);
__io_queue_sqe(req, NULL);
mutex_unlock(&ctx->uring_lock);
@@ -2075,7 +2158,7 @@ static void io_req_task_queue(struct io_kiocb *req)
init_task_work(&req->task_work, io_req_task_submit);
percpu_ref_get(&req->ctx->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req)
}
}
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
@@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
io_free_req(req);
return;
}
- if (req->flags & REQ_F_LINK_HEAD)
- io_queue_next(req);
+ io_queue_next(req);
if (req->task != rb->task) {
if (rb->task) {
@@ -2197,7 +2279,7 @@ static void io_free_req_deferred(struct io_kiocb *req)
int ret;
init_task_work(&req->task_work, io_put_req_deferred_cb);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely to it from here.
*/
- if (noflush && !list_empty(&ctx->cq_overflow_list))
+ if (noflush)
return -1U;
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
- ret = io_sq_thread_acquire_mm(req->ctx, req);
+ ret = io_sq_thread_acquire_mm_files(req->ctx, req);
if (io_resubmit_prep(req, ret)) {
refcount_inc(&req->refs);
@@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_iopoll_getevents() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
+ * task context or in io worker task context. If current task context is
+ * sq thread, we don't need to check whether should wake up sq thread.
+ */
+ if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
}
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
{
- if (state->has_refs)
- fput_many(state->file, state->has_refs);
- state->file = NULL;
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
}
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file)
+ if (state->file_refs)
__io_state_file_put(state);
}
@@ -2698,29 +2784,25 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
if (!state)
return fget(fd);
- if (state->file) {
+ if (state->file_refs) {
if (state->fd == fd) {
- state->has_refs--;
+ state->file_refs--;
return state->file;
}
__io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
- if (!state->file)
+ if (unlikely(!state->file))
return NULL;
state->fd = fd;
- state->has_refs = state->ios_left - 1;
+ state->file_refs = state->ios_left - 1;
return state->file;
}
static bool io_bdev_nowait(struct block_device *bdev)
{
-#ifdef CONFIG_BLOCK
return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
-#else
- return true;
-#endif
}
/*
@@ -2733,14 +2815,16 @@ static bool io_file_supports_async(struct file *file, int rw)
umode_t mode = file_inode(file)->i_mode;
if (S_ISBLK(mode)) {
- if (io_bdev_nowait(file->f_inode->i_bdev))
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(I_BDEV(file->f_mapping->host)))
return true;
return false;
}
if (S_ISCHR(mode) || S_ISSOCK(mode))
return true;
if (S_ISREG(mode)) {
- if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+ if (IS_ENABLED(CONFIG_BLOCK) &&
+ io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
file->f_op != &io_uring_fops)
return true;
return false;
@@ -3065,7 +3149,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock)
{
@@ -3094,7 +3178,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret < 0 ? ret : sqe_len;
+ return ret;
}
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3111,18 +3195,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
req->ctx->compat);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
-{
- struct io_async_rw *iorw = req->async_data;
-
- if (!iorw)
- return __io_import_iovec(rw, req, iovec, iter, needs_lock);
- *iovec = NULL;
- return iov_iter_count(&iorw->iter);
-}
-
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3246,7 +3318,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
struct iovec *iov = iorw->fast_iov;
ssize_t ret;
- ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+ ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
return ret;
@@ -3305,7 +3377,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req, true);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -3379,17 +3451,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- size_t iov_count;
bool no_async;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3405,7 +3477,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (no_async)
goto copy_iov;
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3424,7 +3496,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3507,17 +3579,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
- size_t iov_count;
ssize_t ret, ret2, io_size;
- if (rw)
+ if (rw) {
iter = &rw->iter;
-
- ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
- if (ret < 0)
- return ret;
- iov_count = iov_iter_count(iter);
- io_size = ret;
+ iovec = NULL;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+ if (ret < 0)
+ return ret;
+ }
+ io_size = iov_iter_count(iter);
req->result = io_size;
/* Ensure we clear previously set non-block flag */
@@ -3535,7 +3607,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
(req->flags & REQ_F_ISREG))
goto copy_iov;
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+ ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
if (unlikely(ret))
goto out_free;
@@ -3578,7 +3650,7 @@ done:
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -3590,6 +3662,135 @@ out_free:
return ret;
}
+static int io_renameat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_rename *ren = &req->rename;
+ const char __user *oldf, *newf;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ ren->old_dfd = READ_ONCE(sqe->fd);
+ oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ren->new_dfd = READ_ONCE(sqe->len);
+ ren->flags = READ_ONCE(sqe->rename_flags);
+
+ ren->oldpath = getname(oldf);
+ if (IS_ERR(ren->oldpath))
+ return PTR_ERR(ren->oldpath);
+
+ ren->newpath = getname(newf);
+ if (IS_ERR(ren->newpath)) {
+ putname(ren->oldpath);
+ return PTR_ERR(ren->newpath);
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_rename *ren = &req->rename;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+ ren->newpath, ren->flags);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ struct io_unlink *un = &req->unlink;
+ const char __user *fname;
+
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
+ un->dfd = READ_ONCE(sqe->fd);
+
+ un->flags = READ_ONCE(sqe->unlink_flags);
+ if (un->flags & ~AT_REMOVEDIR)
+ return -EINVAL;
+
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ un->filename = getname(fname);
+ if (IS_ERR(un->filename))
+ return PTR_ERR(un->filename);
+
+ req->flags |= REQ_F_NEED_CLEANUP;
+ return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_unlink *un = &req->unlink;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (un->flags & AT_REMOVEDIR)
+ ret = do_rmdir(un->dfd, un->filename);
+ else
+ ret = do_unlinkat(un->dfd, un->filename);
+
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_NET)
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ sqe->buf_index)
+ return -EINVAL;
+
+ req->shutdown.how = READ_ONCE(sqe->len);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_shutdown_sock(sock, req->shutdown.how);
+ io_req_complete(req, ret);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -3804,7 +4005,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
mode = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->open_flags);
@@ -3818,7 +4019,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
@@ -3948,11 +4149,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
head = idr_find(&ctx->io_buffer_idr, p->bgid);
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
- io_ring_submit_lock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4037,10 +4244,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
}
}
out:
- io_ring_submit_unlock(ctx, !force_nonblock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+
+ /* need to hold the lock to complete IOPOLL requests */
+ if (ctx->flags & IORING_SETUP_IOPOLL) {
+ __io_req_complete(req, ret, 0, cs);
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ } else {
+ io_ring_submit_unlock(ctx, !force_nonblock);
+ __io_req_complete(req, ret, 0, cs);
+ }
return 0;
}
@@ -4212,7 +4426,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
io_req_init_async(req);
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
@@ -4236,7 +4450,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
/* might be already done during nonblock submission */
if (!close->put_file) {
- ret = __close_fd_get_file(close->fd, &close->put_file);
+ ret = close_fd_get_file(close->fd, &close->put_file);
if (ret < 0)
return (ret == -ENOENT) ? -EBADF : ret;
}
@@ -4356,9 +4570,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
@@ -4405,9 +4619,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
@@ -4585,9 +4799,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->async_data) {
kmsg = req->async_data;
@@ -4648,9 +4862,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
unsigned flags;
int ret, cflags = 0;
- sock = sock_from_file(req->file, &ret);
+ sock = sock_from_file(req->file);
if (unlikely(!sock))
- return ret;
+ return -ENOTSOCK;
if (req->flags & REQ_F_BUFFER_SELECT) {
kbuf = io_recv_buffer_select(req, !force_nonblock);
@@ -4694,7 +4908,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index)
return -EINVAL;
@@ -4735,7 +4949,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_connect *conn = &req->connect;
struct io_async_connect *io = req->async_data;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
@@ -4859,7 +5073,6 @@ struct io_poll_table {
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
- bool twa_signal_ok;
int ret;
/* for instances that support it check for an event match first: */
@@ -4875,20 +5088,12 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
percpu_ref_get(&req->ctx->refs);
/*
- * If we using the signalfd wait_queue_head for this wakeup, then
- * it's not safe to use TWA_SIGNAL as we could be recursing on the
- * tsk->sighand->siglock on doing the wakeup. Should not be needed
- * either, as the normal wakeup will suffice.
- */
- twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
-
- /*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = io_req_task_work_add(req, twa_signal_ok);
+ ret = io_req_task_work_add(req);
if (unlikely(ret)) {
struct task_struct *tsk;
@@ -5279,7 +5484,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
/*
* Returns true if we found and killed one or more poll requests
*/
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5291,7 +5497,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_task_match(req, tsk))
+ if (io_match_task(req, tsk, files))
posted += io_poll_remove_one(req);
}
}
@@ -5329,7 +5535,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
sqe->poll_events)
return -EINVAL;
- req->poll.addr = READ_ONCE(sqe->addr);
+ req->poll_remove.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5340,12 +5546,10 @@ static int io_poll_remove_prep(struct io_kiocb *req,
static int io_poll_remove(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- u64 addr;
int ret;
- addr = req->poll.addr;
spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, addr);
+ ret = io_poll_cancel(ctx, req->poll_remove.addr);
spin_unlock_irq(&ctx->completion_lock);
if (ret < 0)
@@ -5438,15 +5642,37 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static int __io_timeout_cancel(struct io_kiocb *req)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+ __u64 user_data)
{
- struct io_timeout_data *io = req->async_data;
- int ret;
+ struct io_timeout_data *io;
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+
+ list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
+ if (user_data == req->user_data) {
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret == -ENOENT)
+ return ERR_PTR(ret);
+ io = req->async_data;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret == -1)
- return -EALREADY;
+ return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
+ return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
@@ -5454,35 +5680,48 @@ static int __io_timeout_cancel(struct io_kiocb *req)
return 0;
}
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ struct timespec64 *ts, enum hrtimer_mode mode)
{
- struct io_kiocb *req;
- int ret = -ENOENT;
-
- list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
- break;
- }
- }
+ struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+ struct io_timeout_data *data;
- if (ret == -ENOENT)
- return ret;
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- return __io_timeout_cancel(req);
+ req->timeout.off = 0; /* noseq */
+ data = req->async_data;
+ list_add_tail(&req->timeout.list, &ctx->timeout_list);
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+ return 0;
}
static int io_timeout_remove_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
+ if (sqe->ioprio || sqe->buf_index || sqe->len)
+ return -EINVAL;
+
+ tr->addr = READ_ONCE(sqe->addr);
+ tr->flags = READ_ONCE(sqe->timeout_flags);
+ if (tr->flags & IORING_TIMEOUT_UPDATE) {
+ if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ return -EINVAL;
+ if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+ return -EFAULT;
+ } else if (tr->flags) {
+ /* timeout removal doesn't support flags */
return -EINVAL;
+ }
- req->timeout_rem.addr = READ_ONCE(sqe->addr);
return 0;
}
@@ -5491,11 +5730,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
*/
static int io_timeout_remove(struct io_kiocb *req)
{
+ struct io_timeout_rem *tr = &req->timeout_rem;
struct io_ring_ctx *ctx = req->ctx;
int ret;
spin_lock_irq(&ctx->completion_lock);
- ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
+ if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
+ enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
+ ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ } else {
+ ret = io_timeout_cancel(ctx, tr->addr);
+ }
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5775,6 +6022,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_remove_buffers_prep(req, sqe);
case IORING_OP_TEE:
return io_tee_prep(req, sqe);
+ case IORING_OP_SHUTDOWN:
+ return io_shutdown_prep(req, sqe);
+ case IORING_OP_RENAMEAT:
+ return io_renameat_prep(req, sqe);
+ case IORING_OP_UNLINKAT:
+ return io_unlinkat_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -5796,11 +6049,10 @@ static u32 io_get_sequence(struct io_kiocb *req)
{
struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 1;
+ u32 total_submitted, nr_reqs = 0;
- if (req->flags & REQ_F_LINK_HEAD)
- list_for_each_entry(pos, &req->link_list, link_list)
- nr_reqs++;
+ io_for_each_link(pos, req)
+ nr_reqs++;
total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
return total_submitted - nr_reqs;
@@ -5852,12 +6104,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_req_drop_files(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
unsigned long flags;
spin_lock_irqsave(&ctx->inflight_lock, flags);
list_del(&req->inflight_entry);
- if (waitqueue_active(&ctx->inflight_wait))
- wake_up(&ctx->inflight_wait);
+ if (atomic_read(&tctx->in_idle))
+ wake_up(&tctx->wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
req->flags &= ~REQ_F_INFLIGHT;
put_files_struct(req->work.identity->files);
@@ -5912,6 +6165,13 @@ static void __io_clean_op(struct io_kiocb *req)
if (req->open.filename)
putname(req->open.filename);
break;
+ case IORING_OP_RENAMEAT:
+ putname(req->rename.oldpath);
+ putname(req->rename.newpath);
+ break;
+ case IORING_OP_UNLINKAT:
+ putname(req->unlink.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
@@ -6018,6 +6278,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
case IORING_OP_TEE:
ret = io_tee(req, force_nonblock);
break;
+ case IORING_OP_SHUTDOWN:
+ ret = io_shutdown(req, force_nonblock);
+ break;
+ case IORING_OP_RENAMEAT:
+ ret = io_renameat(req, force_nonblock);
+ break;
+ case IORING_OP_UNLINKAT:
+ ret = io_unlinkat(req, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -6034,7 +6303,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
if (in_async)
mutex_lock(&ctx->uring_lock);
- io_iopoll_req_issued(req);
+ io_iopoll_req_issued(req, in_async);
if (in_async)
mutex_unlock(&ctx->uring_lock);
@@ -6074,8 +6343,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
}
if (ret) {
- req_set_fail_links(req);
- io_req_complete(req, ret);
+ /*
+ * io_iopoll_complete() does not hold completion_lock to complete
+ * polled io, so here for polled io, just mark it done and still let
+ * io_iopoll_complete() complete it.
+ */
+ if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+ struct kiocb *kiocb = &req->rw.kiocb;
+
+ kiocb_done(kiocb, ret, NULL);
+ } else {
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ }
}
return io_steal_work(req);
@@ -6101,10 +6381,7 @@ static struct file *io_file_get(struct io_submit_state *state,
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
file = io_file_from_index(ctx, fd);
- if (file) {
- req->fixed_file_refs = &ctx->file_data->node->refs;
- percpu_ref_get(req->fixed_file_refs);
- }
+ io_set_resource_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -6113,45 +6390,26 @@ static struct file *io_file_get(struct io_submit_state *state,
return file;
}
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- int fd)
-{
- bool fixed;
-
- fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
- if (unlikely(!fixed && io_async_submit(req->ctx)))
- return -EBADF;
-
- req->file = io_file_get(state, req, fd, fixed);
- if (req->file || io_op_defs[req->opcode].needs_file_no_error)
- return 0;
- return -EBADF;
-}
-
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
- struct io_kiocb *req = data->req;
+ struct io_kiocb *prev, *req = data->req;
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *prev = NULL;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
+ prev = req->timeout.head;
+ req->timeout.head = NULL;
/*
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
- if (!list_empty(&req->link_list)) {
- prev = list_entry(req->link_list.prev, struct io_kiocb,
- link_list);
- if (refcount_inc_not_zero(&prev->refs))
- list_del_init(&req->link_list);
- else
- prev = NULL;
- }
-
+ if (prev && refcount_inc_not_zero(&prev->refs))
+ io_remove_next_linked(prev);
+ else
+ prev = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
@@ -6167,10 +6425,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
static void __io_queue_linked_timeout(struct io_kiocb *req)
{
/*
- * If the list is now empty, then our linked request finished before
- * we got a chance to setup the timer
+ * If the back reference is NULL, then our linked request finished
+ * before we got a chance to setup the timer
*/
- if (!list_empty(&req->link_list)) {
+ if (req->timeout.head) {
struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
@@ -6193,18 +6451,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
- struct io_kiocb *nxt;
-
- if (!(req->flags & REQ_F_LINK_HEAD))
- return NULL;
- if (req->flags & REQ_F_LINK_TIMEOUT)
- return NULL;
+ struct io_kiocb *nxt = req->link;
- nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
- link_list);
- if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
+ if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
+ nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->timeout.head = req;
nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
@@ -6310,8 +6563,13 @@ static inline void io_queue_link_head(struct io_kiocb *req,
io_queue_sqe(req, NULL, cs);
}
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **link, struct io_comp_state *cs)
+ struct io_submit_link *link, struct io_comp_state *cs)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -6323,8 +6581,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
- if (*link) {
- struct io_kiocb *head = *link;
+ if (link->head) {
+ struct io_kiocb *head = link->head;
/*
* Taking sequential execution of a link, draining both sides
@@ -6344,12 +6602,13 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
trace_io_uring_link(ctx, req, head);
- list_add_tail(&req->link_list, &head->link_list);
+ link->last->link = req;
+ link->last = req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head, cs);
- *link = NULL;
+ link->head = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
@@ -6357,13 +6616,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ctx->drain_next = 0;
}
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- req->flags |= REQ_F_LINK_HEAD;
- INIT_LIST_HEAD(&req->link_list);
-
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret))
req->flags |= REQ_F_FAIL_LINK;
- *link = req;
+ link->head = req;
+ link->last = req;
} else {
io_queue_sqe(req, sqe, cs);
}
@@ -6379,7 +6636,8 @@ static void io_submit_state_end(struct io_submit_state *state)
{
if (!list_empty(&state->comp.list))
io_submit_flush_completions(&state->comp);
- blk_finish_plug(&state->plug);
+ if (state->plug_started)
+ blk_finish_plug(&state->plug);
io_state_file_put(state);
if (state->free_reqs)
kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
@@ -6391,12 +6649,12 @@ static void io_submit_state_end(struct io_submit_state *state)
static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
- blk_start_plug(&state->plug);
+ state->plug_started = false;
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
state->free_reqs = 0;
- state->file = NULL;
+ state->file_refs = 0;
state->ios_left = max_ios;
}
@@ -6491,6 +6749,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
+ req->link = NULL;
+ req->fixed_file_refs = NULL;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
@@ -6499,7 +6759,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+ if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
return -EFAULT;
sqe_flags = READ_ONCE(sqe->flags);
@@ -6532,10 +6792,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags;
- if (!io_op_defs[req->opcode].needs_file)
- return 0;
+ /*
+ * Plug now if we have more than 1 IO left after this, and the target
+ * is potentially a read/write to block based storage.
+ */
+ if (!state->plug_started && state->ios_left > 1 &&
+ io_op_defs[req->opcode].plug) {
+ blk_start_plug(&state->plug);
+ state->plug_started = true;
+ }
+
+ ret = 0;
+ if (io_op_defs[req->opcode].needs_file) {
+ bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+ req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+ if (unlikely(!req->file &&
+ !io_op_defs[req->opcode].needs_file_no_error))
+ ret = -EBADF;
+ }
- ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
state->ios_left--;
return ret;
}
@@ -6543,7 +6819,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
struct io_submit_state state;
- struct io_kiocb *link = NULL;
+ struct io_submit_link link;
int i, submitted = 0;
/* if we have a backlog and couldn't flush it all, return BUSY */
@@ -6563,6 +6839,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
refcount_add(nr, &current->usage);
io_submit_state_start(&state, ctx, nr);
+ link.head = NULL;
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
@@ -6608,8 +6885,8 @@ fail_req:
percpu_counter_sub(&tctx->inflight, unused);
put_task_struct_many(current, unused);
}
- if (link)
- io_queue_link_head(link, &state.comp);
+ if (link.head)
+ io_queue_link_head(link.head, &state.comp);
io_submit_state_end(&state);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
@@ -6633,111 +6910,45 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
- int sync, void *key)
-{
- struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
- int ret;
-
- ret = autoremove_wake_function(wqe, mode, sync, key);
- if (ret) {
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- }
- return ret;
-}
-
-enum sq_ret {
- SQT_IDLE = 1,
- SQT_SPIN = 2,
- SQT_DID_WORK = 4,
-};
-
-static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
- unsigned long start_jiffies, bool cap_entries)
+static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
- unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
- struct io_sq_data *sqd = ctx->sq_data;
unsigned int to_submit;
int ret = 0;
-again:
- if (!list_empty(&ctx->iopoll_list)) {
+ to_submit = io_sqring_entries(ctx);
+ /* if we're handling multiple rings, cap submit size for fairness */
+ if (cap_entries && to_submit > 8)
+ to_submit = 8;
+
+ if (!list_empty(&ctx->iopoll_list) || to_submit) {
unsigned nr_events = 0;
mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->iopoll_list) && !need_resched())
+ if (!list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, &nr_events, 0);
+
+ if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+ ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
}
- to_submit = io_sqring_entries(ctx);
-
- /*
- * If submit got -EBUSY, flag us as needing the application
- * to enter the kernel to reap and flush events.
- */
- if (!to_submit || ret == -EBUSY || need_resched()) {
- /*
- * Drop cur_mm before scheduling, we can't hold it for
- * long periods (or over schedule()). Do this before
- * adding ourselves to the waitqueue, as the unuse/drop
- * may sleep.
- */
- io_sq_thread_drop_mm();
-
- /*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep. The exception is if we got EBUSY doing
- * more IO, we should wait for the application to
- * reap events and wake us up.
- */
- if (!list_empty(&ctx->iopoll_list) || need_resched() ||
- (!time_after(jiffies, timeout) && ret != -EBUSY &&
- !percpu_ref_is_dying(&ctx->refs)))
- return SQT_SPIN;
+ if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
- prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
- TASK_INTERRUPTIBLE);
+ return ret;
+}
- /*
- * While doing polled IO, before going to sleep, we need
- * to check if there are new reqs added to iopoll_list,
- * it is because reqs may have been punted to io worker
- * and will be added to iopoll_list later, hence check
- * the iopoll_list again.
- */
- if ((ctx->flags & IORING_SETUP_IOPOLL) &&
- !list_empty_careful(&ctx->iopoll_list)) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- goto again;
- }
+static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+{
+ struct io_ring_ctx *ctx;
+ unsigned sq_thread_idle = 0;
- to_submit = io_sqring_entries(ctx);
- if (!to_submit || ret == -EBUSY)
- return SQT_IDLE;
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if (sq_thread_idle < ctx->sq_thread_idle)
+ sq_thread_idle = ctx->sq_thread_idle;
}
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
- io_ring_clear_wakeup_flag(ctx);
-
- /* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > 8)
- to_submit = 8;
-
- mutex_lock(&ctx->uring_lock);
- if (likely(!percpu_ref_is_dying(&ctx->refs)))
- ret = io_submit_sqes(ctx, to_submit);
- mutex_unlock(&ctx->uring_lock);
-
- if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
-
- return SQT_DID_WORK;
+ sqd->sq_thread_idle = sq_thread_idle;
}
static void io_sqd_init_new(struct io_sq_data *sqd)
@@ -6746,39 +6957,56 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
while (!list_empty(&sqd->ctx_new_list)) {
ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
- init_wait(&ctx->sqo_wait_entry);
- ctx->sqo_wait_entry.func = io_sq_wake_function;
list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
complete(&ctx->sq_thread_comp);
}
+
+ io_sqd_update_thread_idle(sqd);
}
static int io_sq_thread(void *data)
{
struct cgroup_subsys_state *cur_css = NULL;
+ struct files_struct *old_files = current->files;
+ struct nsproxy *old_nsproxy = current->nsproxy;
const struct cred *old_cred = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
- unsigned long start_jiffies;
+ unsigned long timeout = 0;
+ DEFINE_WAIT(wait);
+
+ task_lock(current);
+ current->files = NULL;
+ current->nsproxy = NULL;
+ task_unlock(current);
- start_jiffies = jiffies;
while (!kthread_should_stop()) {
- enum sq_ret ret = 0;
- bool cap_entries;
+ int ret;
+ bool cap_entries, sqt_spin, needs_sched;
/*
* Any changes to the sqd lists are synchronized through the
* kthread parking. This synchronizes the thread vs users,
* the users are synchronized on the sqd->ctx_lock.
*/
- if (kthread_should_park())
+ if (kthread_should_park()) {
kthread_parkme();
+ /*
+ * When sq thread is unparked, in case the previous park operation
+ * comes from io_put_sq_data(), which means that sq thread is going
+ * to be stopped, so here needs to have a check.
+ */
+ if (kthread_should_stop())
+ break;
+ }
- if (unlikely(!list_empty(&sqd->ctx_new_list)))
+ if (unlikely(!list_empty(&sqd->ctx_new_list))) {
io_sqd_init_new(sqd);
+ timeout = jiffies + sqd->sq_thread_idle;
+ }
+ sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
-
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
if (current->cred != ctx->creds) {
if (old_cred)
@@ -6791,24 +7019,49 @@ static int io_sq_thread(void *data)
current->sessionid = ctx->sessionid;
#endif
- ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
+ ret = __io_sq_thread(ctx, cap_entries);
+ if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+ sqt_spin = true;
- io_sq_thread_drop_mm();
+ io_sq_thread_drop_mm_files();
}
- if (ret & SQT_SPIN) {
+ if (sqt_spin || !time_after(jiffies, timeout)) {
io_run_task_work();
cond_resched();
- } else if (ret == SQT_IDLE) {
- if (kthread_should_park())
- continue;
+ if (sqt_spin)
+ timeout = jiffies + sqd->sq_thread_idle;
+ continue;
+ }
+
+ if (kthread_should_park())
+ continue;
+
+ needs_sched = true;
+ prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->iopoll_list)) {
+ needs_sched = false;
+ break;
+ }
+ if (io_sqring_entries(ctx)) {
+ needs_sched = false;
+ break;
+ }
+ }
+
+ if (needs_sched) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
+
schedule();
- start_jiffies = jiffies;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
+
+ finish_wait(&sqd->wait, &wait);
+ timeout = jiffies + sqd->sq_thread_idle;
}
io_run_task_work();
@@ -6818,6 +7071,11 @@ static int io_sq_thread(void *data)
if (old_cred)
revert_creds(old_cred);
+ task_lock(current);
+ current->files = old_files;
+ current->nsproxy = old_nsproxy;
+ task_unlock(current);
+
kthread_parkme();
return 0;
@@ -6862,13 +7120,8 @@ static int io_run_task_work_sig(void)
return 1;
if (!signal_pending(current))
return 0;
- if (current->jobctl & JOBCTL_TASK_WORK) {
- spin_lock_irq(&current->sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return 1;
- }
+ if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+ return -ERESTARTSYS;
return -EINTR;
}
@@ -6877,7 +7130,8 @@ static int io_run_task_work_sig(void)
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
- const sigset_t __user *sig, size_t sigsz)
+ const sigset_t __user *sig, size_t sigsz,
+ struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq = {
.wq = {
@@ -6889,6 +7143,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
+ struct timespec64 ts;
+ signed long timeout = 0;
int ret = 0;
do {
@@ -6911,6 +7167,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+ if (uts) {
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+ timeout = timespec64_to_jiffies(&ts);
+ }
+
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
@@ -6924,7 +7186,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
if (io_should_wake(&iowq, false))
break;
- schedule();
+ if (uts) {
+ timeout = schedule_timeout(timeout);
+ if (timeout == 0) {
+ ret = -ETIME;
+ break;
+ }
+ } else {
+ schedule();
+ }
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6973,9 +7243,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
if (!data)
return -ENXIO;
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
ref_node = data->node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
@@ -7098,12 +7368,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
mutex_lock(&sqd->ctx_lock);
list_del(&ctx->sqd_list);
+ io_sqd_update_thread_idle(sqd);
mutex_unlock(&sqd->ctx_lock);
- if (sqd->thread) {
- finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+ if (sqd->thread)
io_sq_thread_unpark(sqd);
- }
io_put_sq_data(sqd);
ctx->sq_data = NULL;
@@ -7358,7 +7627,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
data = ref_node->file_data;
ctx = data->ctx;
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
ref_node->done = true;
while (!list_empty(&data->ref_list)) {
@@ -7370,7 +7639,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
list_del(&ref_node->node);
first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
}
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
if (percpu_ref_is_dying(&data->refs))
delay = 0;
@@ -7493,9 +7762,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
file_data->node = ref_node;
- spin_lock(&file_data->lock);
+ spin_lock_bh(&file_data->lock);
list_add_tail(&ref_node->node, &file_data->ref_list);
- spin_unlock(&file_data->lock);
+ spin_unlock_bh(&file_data->lock);
percpu_ref_get(&file_data->refs);
return ret;
out_fput:
@@ -7652,10 +7921,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
- spin_lock(&data->lock);
+ spin_lock_bh(&data->lock);
list_add_tail(&ref_node->node, &data->ref_list);
data->node = ref_node;
- spin_unlock(&data->lock);
+ spin_unlock_bh(&data->lock);
percpu_ref_get(&ctx->file_data->refs);
} else
destroy_fixed_file_ref_node(ref_node);
@@ -7783,7 +8052,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_sq_data *sqd;
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
goto err;
sqd = io_get_sq_data(p);
@@ -8369,8 +8638,6 @@ static void io_ring_exit_work(struct work_struct *work)
* as nobody else will be looking for them.
*/
do {
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
@@ -8380,17 +8647,17 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true, NULL, NULL);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL);
- io_poll_remove_all(ctx, NULL);
+ io_kill_timeouts(ctx, NULL, NULL);
+ io_poll_remove_all(ctx, NULL, NULL);
if (ctx->io_wq)
io_wq_cancel_all(ctx->io_wq);
/* if we failed setting up the ctx, we might not have any rings */
- if (ctx->rings)
- io_cqring_overflow_flush(ctx, true, NULL, NULL);
io_iopoll_try_reap_events(ctx);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
@@ -8421,120 +8688,31 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
-static bool io_wq_files_match(struct io_wq_work *work, void *data)
-{
- struct files_struct *files = data;
-
- return !files || ((work->flags & IO_WQ_WORK_FILES) &&
- work->identity->files == files);
-}
-
-/*
- * Returns true if 'preq' is the link parent of 'req'
- */
-static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
- if (!(preq->flags & REQ_F_LINK_HEAD))
- return false;
-
- list_for_each_entry(link, &preq->link_list, link_list) {
- if (link == req)
- return true;
- }
-
- return false;
-}
-
-/*
- * We're looking to cancel 'req' because it's holding on to our files, but
- * 'req' could be a link to another request. See if it is, and cancel that
- * parent request if so.
- */
-static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- struct hlist_node *tmp;
- struct io_kiocb *preq;
- bool found = false;
- int i;
-
- spin_lock_irq(&ctx->completion_lock);
- for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
- struct hlist_head *list;
-
- list = &ctx->cancel_hash[i];
- hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
- found = io_match_link(preq, req);
- if (found) {
- io_poll_remove_one(preq);
- break;
- }
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
-
-static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_kiocb *preq;
- bool found = false;
-
- spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
- found = io_match_link(preq, req);
- if (found) {
- __io_timeout_cancel(preq);
- break;
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return found;
-}
+struct io_task_cancel {
+ struct task_struct *task;
+ struct files_struct *files;
+};
-static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ struct io_task_cancel *cancel = data;
bool ret;
- if (req->flags & REQ_F_LINK_TIMEOUT) {
+ if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
unsigned long flags;
struct io_ring_ctx *ctx = req->ctx;
/* protect against races with linked timeouts */
spin_lock_irqsave(&ctx->completion_lock, flags);
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
} else {
- ret = io_match_link(req, data);
+ ret = io_match_task(req, cancel->task, cancel->files);
}
return ret;
}
-static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
- enum io_wq_cancel cret;
-
- /* cancel this particular work, if it's running */
- cret = io_wq_cancel_work(ctx->io_wq, &req->work);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* find links that hold this pending, cancel those */
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
- if (cret != IO_WQ_CANCEL_NOTFOUND)
- return;
-
- /* if we have a poll link holding this pending, cancel that */
- if (io_poll_remove_link(ctx, req))
- return;
-
- /* final option, timeout link is holding this req pending */
- io_timeout_remove_link(ctx, req);
-}
-
static void io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
struct files_struct *files)
@@ -8544,8 +8722,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_task_match(de->req, task) &&
- io_match_files(de->req, files)) {
+ if (io_match_task(de->req, task, files)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8562,72 +8739,52 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-/*
- * Returns true if we found and killed one or more files pinning requests
- */
-static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
struct files_struct *files)
{
- if (list_empty_careful(&ctx->inflight_list))
- return false;
-
- /* cancel all at once, should be faster than doing it one by one*/
- io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
-
while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_kiocb *cancel_req = NULL, *req;
+ struct io_task_cancel cancel = { .task = task, .files = files };
+ struct io_kiocb *req;
DEFINE_WAIT(wait);
+ bool found = false;
spin_lock_irq(&ctx->inflight_lock);
list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+ if (req->task != task ||
req->work.identity->files != files)
continue;
- /* req is being completed, ignore */
- if (!refcount_inc_not_zero(&req->refs))
- continue;
- cancel_req = req;
+ found = true;
break;
}
- if (cancel_req)
- prepare_to_wait(&ctx->inflight_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ if (found)
+ prepare_to_wait(&task->io_uring->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&ctx->inflight_lock);
/* We need to keep going until we don't find a matching req */
- if (!cancel_req)
+ if (!found)
break;
- /* cancel this request, or head link requests */
- io_attempt_cancel(ctx, cancel_req);
- io_put_req(cancel_req);
+
+ io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+ io_poll_remove_all(ctx, task, files);
+ io_kill_timeouts(ctx, task, files);
/* cancellations _may_ trigger task work */
io_run_task_work();
schedule();
- finish_wait(&ctx->inflight_wait, &wait);
+ finish_wait(&task->io_uring->wait, &wait);
}
-
- return true;
}
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
+static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
- struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct task_struct *task = data;
-
- return io_task_match(req, task);
-}
-
-static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- bool ret;
-
- ret = io_uring_cancel_files(ctx, files);
- if (!files) {
+ while (1) {
+ struct io_task_cancel cancel = { .task = task, .files = NULL, };
enum io_wq_cancel cret;
+ bool ret = false;
- cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
if (cret != IO_WQ_CANCEL_NOTFOUND)
ret = true;
@@ -8639,11 +8796,13 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
}
}
- ret |= io_poll_remove_all(ctx, task);
- ret |= io_kill_timeouts(ctx, task);
+ ret |= io_poll_remove_all(ctx, task, NULL);
+ ret |= io_kill_timeouts(ctx, task, NULL);
+ if (!ret)
+ break;
+ io_run_task_work();
+ cond_resched();
}
-
- return ret;
}
/*
@@ -8662,17 +8821,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
io_sq_thread_park(ctx->sq_data);
}
- if (files)
- io_cancel_defer_files(ctx, NULL, files);
- else
- io_cancel_defer_files(ctx, task, NULL);
-
+ io_cancel_defer_files(ctx, task, files);
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
io_cqring_overflow_flush(ctx, true, task, files);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
- while (__io_uring_cancel_task_requests(ctx, task, files)) {
- io_run_task_work();
- cond_resched();
- }
+ if (!files)
+ __io_uring_cancel_task_requests(ctx, task);
+ else
+ io_uring_cancel_files(ctx, task, files);
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
atomic_dec(&task->io_uring->in_idle);
@@ -8930,9 +9087,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
finish_wait(&ctx->sqo_sq_wait, &wait);
}
+static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+ struct __kernel_timespec __user **ts,
+ const sigset_t __user **sig)
+{
+ struct io_uring_getevents_arg arg;
+
+ /*
+ * If EXT_ARG isn't set, then we have no timespec and the argp pointer
+ * is just a pointer to the sigset_t.
+ */
+ if (!(flags & IORING_ENTER_EXT_ARG)) {
+ *sig = (const sigset_t __user *) argp;
+ *ts = NULL;
+ return 0;
+ }
+
+ /*
+ * EXT_ARG is set - ensure we agree on the size of it and copy in our
+ * timespec and sigset_t pointers if good.
+ */
+ if (*argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
+ *sig = u64_to_user_ptr(arg.sigmask);
+ *argsz = arg.sigmask_sz;
+ *ts = u64_to_user_ptr(arg.ts);
+ return 0;
+}
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
- u32, min_complete, u32, flags, const sigset_t __user *, sig,
- size_t, sigsz)
+ u32, min_complete, u32, flags, const void __user *, argp,
+ size_t, argsz)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
@@ -8942,7 +9129,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
return -EINVAL;
f = fdget(fd);
@@ -8969,8 +9156,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (!list_empty_careful(&ctx->cq_overflow_list))
io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT)
@@ -8988,6 +9177,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
+ const sigset_t __user *sig;
+ struct __kernel_timespec __user *ts;
+
+ ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+ if (unlikely(ret))
+ goto out;
+
min_complete = min(min_complete, ctx->cq_entries);
/*
@@ -9000,7 +9196,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, min_complete);
} else {
- ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+ ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
}
}
@@ -9368,7 +9564,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
- IORING_FEAT_POLL_32BITS;
+ IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+ IORING_FEAT_EXT_ARG;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 10cc7979ce38..16a1e82e3aeb 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page)
return !TestSetPageDirty(page);
/*
- * Lock out page->mem_cgroup migration to keep PageDirty
+ * Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 7dfcab2a2da6..94b7c1cb5ceb 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -668,7 +668,7 @@ unlock:
* this does not succeed, we finally try to allocate anywhere
* within the aggregate.
*
- * we also try to allocate anywhere within the aggregate for
+ * we also try to allocate anywhere within the aggregate
* for allocation requests larger than the allocation group
* size or requests that specify no hint value.
*
@@ -2549,15 +2549,19 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (oldval == NOFREE) {
rc = dbBackSplit((dmtree_t *) dcp, leafno);
- if (rc)
+ if (rc) {
+ release_metapage(mp);
return rc;
+ }
oldval = dcp->stree[ti];
}
dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
} else {
rc = dbJoin((dmtree_t *) dcp, leafno, newval);
- if (rc)
+ if (rc) {
+ release_metapage(mp);
return rc;
+ }
}
/* check if the root of the current dmap control page changed due
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 29891fad3f09..aa03a904d5ab 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -183,7 +183,7 @@ typedef union dmtree {
#define dmt_leafidx t1.leafidx
#define dmt_height t1.height
#define dmt_budmin t1.budmin
-#define dmt_stree t1.stree
+#define dmt_stree t2.stree
/*
* on-disk aggregate disk allocation map descriptor.
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index f65bd6b35412..bb4a342a193d 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -575,7 +575,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
* blkno - starting block number of the extents current allocation.
* nblks - number of blocks within the extents current allocation.
* newnblks - pointer to a s64 value. on entry, this value is the
- * the new desired extent size (number of blocks). on
+ * new desired extent size (number of blocks). on
* successful exit, this value is set to the extent's actual
* new size (new number of blocks).
* newblkno - the starting block number of the extents new allocation.
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index dd635a8a0f8c..1c984214e95e 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -5,7 +5,7 @@
#ifndef _H_JFS_EXTENT
#define _H_JFS_EXTENT
-/* get block allocation allocation hint as location of disk inode */
+/* get block allocation hint as location of disk inode */
#define INOHINT(ip) \
(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 7fd125c8dd19..805877ce5020 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -132,7 +132,7 @@ struct logpage {
* (this comment should be rewritten !)
* jfs uses only "after" log records (only a single writer is allowed
* in a page, pages are written to temporary paging space if
- * if they must be written to disk before commit, and i/o is
+ * they must be written to disk before commit, and i/o is
* scheduled for modified pages to their home location after
* the log records containing the after values and the commit
* record is written to the log on disk, undo discards the copy
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index c8ce7f1bc594..dca8edd2378c 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1474,7 +1474,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
* For the LOG_NOREDOINOEXT record, we need
* to pass the IAG number and inode extent
* index (within that IAG) from which the
- * the extent being released. These have been
+ * extent is being released. These have been
* passed to us in the iplist[1] and iplist[2].
*/
lrd->log.noredoinoext.iagnum =
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 16ad920f6fb1..3148e9b35f3b 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3684,7 +3684,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
*
* function:
* Perform truncate to zero length for deleted file, leaving the
- * the xtree and working map untouched. This allows the file to
+ * xtree and working map untouched. This allows the file to
* be accessed via open file handles, while the delete of the file
* is committed to disk.
*
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 9aec80b9d7c6..7a53eed69fef 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -604,8 +604,7 @@ const struct dentry_operations kernfs_dops = {
*/
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{
- if (dentry->d_sb->s_op == &kernfs_sops &&
- !d_really_is_negative(dentry))
+ if (dentry->d_sb->s_op == &kernfs_sops)
return kernfs_dentry_node(dentry);
return NULL;
}
@@ -1599,7 +1598,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
return error;
}
-/* Relationship between s_mode and the DT_xxx types */
+/* Relationship between mode and the DT_xxx types */
static inline unsigned char dt_type(struct kernfs_node *kn)
{
return (kn->mode >> 12) & 15;
diff --git a/fs/libfs.c b/fs/libfs.c
index 7124c2e8df2f..d1c3bade9f30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1451,4 +1451,74 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
return 0;
}
EXPORT_SYMBOL(generic_ci_d_hash);
+
+static const struct dentry_operations generic_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+};
+#endif
+
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
+#endif
+
+#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+ .d_revalidate = fscrypt_d_revalidate,
+};
+#endif
+
+/**
+ * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
+ * @dentry: dentry to set ops on
+ *
+ * Casefolded directories need d_hash and d_compare set, so that the dentries
+ * contained in them are handled case-insensitively. Note that these operations
+ * are needed on the parent directory rather than on the dentries in it, and
+ * while the casefolding flag can be toggled on and off on an empty directory,
+ * dentry_operations can't be changed later. As a result, if the filesystem has
+ * casefolding support enabled at all, we have to give all dentries the
+ * casefolding operations even if their inode doesn't have the casefolding flag
+ * currently (and thus the casefolding ops would be no-ops for now).
+ *
+ * Encryption works differently in that the only dentry operation it needs is
+ * d_revalidate, which it only needs on dentries that have the no-key name flag.
+ * The no-key flag can't be set "later", so we don't have to worry about that.
+ *
+ * Finally, to maximize compatibility with overlayfs (which isn't compatible
+ * with certain dentry operations) and to avoid taking an unnecessary
+ * performance hit, we use custom dentry_operations for each possible
+ * combination rather than always installing all operations.
+ */
+void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
+#endif
+#ifdef CONFIG_UNICODE
+ bool needs_ci_ops = dentry->d_sb->s_encoding;
+#endif
+#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+ if (needs_encrypt_ops && needs_ci_ops) {
+ d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+ return;
+ }
#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ if (needs_encrypt_ops) {
+ d_set_d_op(dentry, &generic_encrypted_dentry_ops);
+ return;
+ }
+#endif
+#ifdef CONFIG_UNICODE
+ if (needs_ci_ops) {
+ d_set_d_op(dentry, &generic_ci_dentry_ops);
+ return;
+ }
+#endif
+}
+EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
diff --git a/fs/locks.c b/fs/locks.c
index 1f84a03601fe..99ca97e81b7a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -542,7 +542,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
if (l->l_len > 0) {
if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
return -EOVERFLOW;
- fl->fl_end = fl->fl_start + l->l_len - 1;
+ fl->fl_end = fl->fl_start + (l->l_len - 1);
} else if (l->l_len < 0) {
if (fl->fl_start + l->l_len < 0)
@@ -750,7 +750,7 @@ static void __locks_wake_up_blocks(struct file_lock *blocker)
}
/**
- * locks_delete_lock - stop waiting for a file lock
+ * locks_delete_block - stop waiting for a file lock
* @waiter: the lock which was waiting
*
* lockd/nfsd need to disconnect the lock while working on it.
@@ -2539,14 +2539,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
+ struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
+ spin_lock(&files->file_lock);
+ f = files_lookup_fd_locked(files, fd);
+ spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2670,14 +2671,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
*/
if (!error && file_lock->fl_type != F_UNLCK &&
!(file_lock->fl_flags & FL_OFDLCK)) {
+ struct files_struct *files = current->files;
/*
* We need that spin_lock here - it prevents reordering between
* update of i_flctx->flc_posix and check for it done in
* close(). rcu_read_lock() wouldn't do.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
+ spin_lock(&files->file_lock);
+ f = files_lookup_fd_locked(files, fd);
+ spin_unlock(&files->file_lock);
if (f != filp) {
file_lock->fl_type = F_UNLCK;
error = do_lock_file_wait(filp, cmd, file_lock);
diff --git a/fs/mount.h b/fs/mount.h
index c7abb7b394d8..ce6c376e0bc2 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -6,7 +6,6 @@
#include <linux/fs_pin.h>
struct mnt_namespace {
- atomic_t count;
struct ns_common ns;
struct mount * root;
/*
@@ -120,7 +119,7 @@ static inline void detach_mounts(struct dentry *dentry)
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
- atomic_inc(&ns->count);
+ refcount_inc(&ns->ns.count);
}
extern seqlock_t mount_lock;
diff --git a/fs/namei.c b/fs/namei.c
index d4a6dd772303..03d0e11e4f36 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4346,8 +4346,8 @@ out:
}
EXPORT_SYMBOL(vfs_rename);
-static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
- const char __user *newname, unsigned int flags)
+int do_renameat2(int olddfd, struct filename *from, int newdfd,
+ struct filename *to, unsigned int flags)
{
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
@@ -4355,32 +4355,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
- struct filename *from;
- struct filename *to;
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
- int error;
+ int error = -EINVAL;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
- return -EINVAL;
+ goto put_both;
if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
(flags & RENAME_EXCHANGE))
- return -EINVAL;
+ goto put_both;
if (flags & RENAME_EXCHANGE)
target_flags = 0;
retry:
- from = filename_parentat(olddfd, getname(oldname), lookup_flags,
- &old_path, &old_last, &old_type);
+ from = filename_parentat(olddfd, from, lookup_flags, &old_path,
+ &old_last, &old_type);
if (IS_ERR(from)) {
error = PTR_ERR(from);
- goto exit;
+ goto put_new;
}
- to = filename_parentat(newdfd, getname(newname), lookup_flags,
- &new_path, &new_last, &new_type);
+ to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
+ &new_type);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
@@ -4473,34 +4471,40 @@ exit2:
if (retry_estale(error, lookup_flags))
should_retry = true;
path_put(&new_path);
- putname(to);
exit1:
path_put(&old_path);
- putname(from);
if (should_retry) {
should_retry = false;
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-exit:
+put_both:
+ if (!IS_ERR(from))
+ putname(from);
+put_new:
+ if (!IS_ERR(to))
+ putname(to);
return error;
}
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
{
- return do_renameat2(olddfd, oldname, newdfd, newname, flags);
+ return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
+ flags);
}
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname)
{
- return do_renameat2(olddfd, oldname, newdfd, newname, 0);
+ return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
+ 0);
}
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
- return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
+ getname(newname), 0);
}
int readlink_copy(char __user *buffer, int buflen, const char *link)
diff --git a/fs/namespace.c b/fs/namespace.c
index cebaa3e81794..2b681f65ca04 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3274,7 +3274,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
new_ns->ns.ops = &mntns_operations;
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
- atomic_set(&new_ns->count, 1);
+ refcount_set(&new_ns->ns.count, 1);
INIT_LIST_HEAD(&new_ns->list);
init_waitqueue_head(&new_ns->poll);
spin_lock_init(&new_ns->ns_lock);
@@ -3848,7 +3848,7 @@ void __init mnt_init(void)
void put_mnt_ns(struct mnt_namespace *ns)
{
- if (!atomic_dec_and_test(&ns->count))
+ if (!refcount_dec_and_test(&ns->ns.count))
return;
drop_collected_mounts(&ns->root->mnt);
free_mnt_ns(ns);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 08108b6d2fa1..3be6836074ae 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -697,7 +697,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
xdr_init_decode_pages(&xdr, &buf,
lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&xdr, scratch);
status = -EIO;
p = xdr_inline_decode(&xdr, 4);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index dec5880ac6de..acb1d22907da 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -510,7 +510,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out;
xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&xdr, scratch);
p = xdr_inline_decode(&xdr, sizeof(__be32));
if (!p)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e61dbc9b86ae..f7786e00a6a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -6,10 +6,15 @@
*
* NFSv4 callback procedures
*/
+
+#include <linux/errno.h>
+#include <linux/math.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
+#include <linux/types.h>
+
#include "nfs4_fs.h"
#include "callback.h"
#include "delegation.h"
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4e011adaf967..8a24fe20dccf 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -576,7 +576,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
goto out_nopages;
xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&stream, scratch);
do {
if (entry->label)
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 3430d6891e89..7412bb164fa7 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -171,4 +171,7 @@ const struct export_operations nfs_export_ops = {
.encode_fh = nfs_encode_fh,
.fh_to_dentry = nfs_fh_to_dentry,
.get_parent = nfs_get_parent,
+ .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
+ EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
+ EXPORT_OP_NOATOMIC_ATTR,
};
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7f5aa0403e16..d158a500c25c 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -666,7 +666,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
return -ENOMEM;
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&stream, scratch);
/* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
* num_fh (4) */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index d913e818858f..86c3f7e69ec4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -82,7 +82,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err;
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&stream, scratch);
/* Get the stripe count (number of stripe index) */
p = xdr_inline_decode(&stream, 4);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 24bf5797f88a..4252ce633533 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -378,7 +378,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
lgr->layoutp->len);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&stream, scratch);
/* stripe unit and mirror_array_cnt */
rc = -EIO;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 3eda40a320a5..c9b61b818ec1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -69,7 +69,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
INIT_LIST_HEAD(&dsaddrs);
xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+ xdr_set_scratch_page(&stream, scratch);
/* multipath count */
p = xdr_inline_decode(&stream, 4);
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 8432bd6b95f0..ea7dd8cbfac9 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1539,7 +1539,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
- xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE);
+ xdr_set_scratch_page(xdr, res->scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c6dbfcae7517..2eabe5add344 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6403,10 +6403,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
struct compound_hdr hdr;
int status;
- if (res->acl_scratch != NULL) {
- void *p = page_address(res->acl_scratch);
- xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
- }
+ if (res->acl_scratch != NULL)
+ xdr_set_scratch_page(xdr, res->acl_scratch);
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index b73d9dd37f73..26f2a50eceac 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -69,10 +69,14 @@ __state_in_grace(struct net *net, bool open)
if (!open)
return !list_empty(grace_list);
+ spin_lock(&grace_lock);
list_for_each_entry(lm, grace_list, list) {
- if (lm->block_opens)
+ if (lm->block_opens) {
+ spin_unlock(&grace_lock);
return true;
+ }
}
+ spin_unlock(&grace_lock);
return false;
}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 21e404e7cb68..81e7bb12aca6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -408,6 +408,12 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
return -EINVAL;
}
+ if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK &&
+ !(*flags & NFSEXP_NOSUBTREECHECK)) {
+ dprintk("%s: %s does not support subtree checking!\n",
+ __func__, inode->i_sb->s_type->name);
+ return -EINVAL;
+ }
return 0;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 3c6c2f7d1688..53fcbf79bdca 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -600,7 +600,7 @@ static struct notifier_block nfsd_file_lease_notifier = {
static int
nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
- const struct qstr *name)
+ const struct qstr *name, u32 cookie)
{
trace_nfsd_file_fsnotify_handle_event(inode, mask);
@@ -685,6 +685,7 @@ nfsd_file_cache_init(void)
if (IS_ERR(nfsd_file_fsnotify_group)) {
pr_err("nfsd: unable to create fsnotify group: %ld\n",
PTR_ERR(nfsd_file_fsnotify_group));
+ ret = PTR_ERR(nfsd_file_fsnotify_group);
nfsd_file_fsnotify_group = NULL;
goto out_notifier;
}
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 6a900f770dd2..b0f66604532a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -185,10 +185,6 @@ out:
/*
* XDR decode functions
*/
-static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
-{
- return 1;
-}
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
@@ -255,15 +251,6 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
* XDR encode functions
*/
-/*
- * There must be an encoding function for void results so svc_process
- * will work properly.
- */
-static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_ressize_check(rqstp, p);
-}
-
/* GETACL */
static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
{
@@ -378,10 +365,10 @@ struct nfsd3_voidargs { int dummy; };
static const struct svc_procedure nfsd_acl_procedures2[5] = {
[ACLPROC2_NULL] = {
.pc_func = nfsacld_proc_null,
- .pc_decode = nfsaclsvc_decode_voidarg,
- .pc_encode = nfsaclsvc_encode_voidres,
- .pc_argsize = sizeof(struct nfsd3_voidargs),
- .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
},
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 34a394e50e1d..7c30876a31a1 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -245,10 +245,10 @@ struct nfsd3_voidargs { int dummy; };
static const struct svc_procedure nfsd_acl_procedures3[3] = {
[ACLPROC3_NULL] = {
.pc_func = nfsd3_proc_null,
- .pc_decode = nfs3svc_decode_voidarg,
- .pc_encode = nfs3svc_encode_voidres,
- .pc_argsize = sizeof(struct nfsd3_voidargs),
- .pc_ressize = sizeof(struct nfsd3_voidargs),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
},
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a633044b0dc1..76931f4f57c3 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -689,12 +689,9 @@ out:
#define nfsd3_mkdirargs nfsd3_createargs
#define nfsd3_readdirplusargs nfsd3_readdirargs
#define nfsd3_fhandleargs nfsd_fhandle
-#define nfsd3_fhandleres nfsd3_attrstat
#define nfsd3_attrstatres nfsd3_attrstat
#define nfsd3_wccstatres nfsd3_attrstat
#define nfsd3_createres nfsd3_diropres
-#define nfsd3_voidres nfsd3_voidargs
-struct nfsd3_voidargs { int dummy; };
#define ST 1 /* status*/
#define FH 17 /* filehandle with length */
@@ -705,10 +702,10 @@ struct nfsd3_voidargs { int dummy; };
static const struct svc_procedure nfsd_procedures3[22] = {
[NFS3PROC_NULL] = {
.pc_func = nfsd3_proc_null,
- .pc_decode = nfs3svc_decode_voidarg,
- .pc_encode = nfs3svc_encode_voidres,
- .pc_argsize = sizeof(struct nfsd3_voidargs),
- .pc_ressize = sizeof(struct nfsd3_voidres),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
},
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2277f83da250..821db21ba072 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -206,7 +206,7 @@ static __be32 *
encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
- if (dentry && d_really_is_positive(dentry)) {
+ if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) {
__be32 err;
struct kstat stat;
@@ -259,11 +259,11 @@ void fill_pre_wcc(struct svc_fh *fhp)
{
struct inode *inode;
struct kstat stat;
+ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
__be32 err;
- if (fhp->fh_pre_saved)
+ if (fhp->fh_no_wcc || fhp->fh_pre_saved)
return;
-
inode = d_inode(fhp->fh_dentry);
err = fh_getattr(fhp, &stat);
if (err) {
@@ -272,11 +272,12 @@ void fill_pre_wcc(struct svc_fh *fhp)
stat.ctime = inode->i_ctime;
stat.size = inode->i_size;
}
+ if (v4)
+ fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
fhp->fh_pre_mtime = stat.mtime;
fhp->fh_pre_ctime = stat.ctime;
fhp->fh_pre_size = stat.size;
- fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
fhp->fh_pre_saved = true;
}
@@ -285,30 +286,30 @@ void fill_pre_wcc(struct svc_fh *fhp)
*/
void fill_post_wcc(struct svc_fh *fhp)
{
+ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
+ struct inode *inode = d_inode(fhp->fh_dentry);
__be32 err;
+ if (fhp->fh_no_wcc)
+ return;
+
if (fhp->fh_post_saved)
printk("nfsd: inode locked twice during operation.\n");
err = fh_getattr(fhp, &fhp->fh_post_attr);
- fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
- d_inode(fhp->fh_dentry));
if (err) {
fhp->fh_post_saved = false;
- /* Grab the ctime anyway - set_change_info might use it */
- fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
+ fhp->fh_post_attr.ctime = inode->i_ctime;
} else
fhp->fh_post_saved = true;
+ if (v4)
+ fhp->fh_post_change =
+ nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
/*
* XDR decode functions
*/
-int
-nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
-{
- return 1;
-}
int
nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
@@ -642,12 +643,6 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
* XDR encode functions
*/
-int
-nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_ressize_check(rqstp, p);
-}
-
/* GETATTR */
int
nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
@@ -707,6 +702,7 @@ int
nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+ struct kvec *head = rqstp->rq_res.head;
*p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -720,6 +716,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
}
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len))
+ return 0;
return 1;
} else
return xdr_ressize_check(rqstp, p);
@@ -730,6 +728,7 @@ int
nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd3_readres *resp = rqstp->rq_resp;
+ struct kvec *head = rqstp->rq_res.head;
*p++ = resp->status;
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -746,6 +745,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
}
+ if (svc_encode_result_payload(rqstp, head->iov_len,
+ resp->count))
+ return 0;
return 1;
} else
return xdr_ressize_check(rqstp, p);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e83b21778816..4727b7f03c5b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -257,8 +257,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* in NFSv4 as in v3 except EXCLUSIVE4_1.
*/
current->fs->umask = open->op_umask;
- status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
- open->op_fname.len, &open->op_iattr,
+ status = do_nfsd_create(rqstp, current_fh, open->op_fname,
+ open->op_fnamelen, &open->op_iattr,
*resfh, open->op_createmode,
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
@@ -283,7 +283,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
* a chance to an acquire a delegation if appropriate.
*/
status = nfsd_lookup(rqstp, current_fh,
- open->op_fname.data, open->op_fname.len, *resfh);
+ open->op_fname, open->op_fnamelen, *resfh);
if (status)
goto out;
status = nfsd_check_obj_isreg(*resfh);
@@ -360,7 +360,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
bool reclaim = false;
dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
- (int)open->op_fname.len, open->op_fname.data,
+ (int)open->op_fnamelen, open->op_fname,
open->op_openowner);
/* This check required by spec. */
@@ -1023,8 +1023,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_how_written = write->wr_stable_how;
- nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
- &write->wr_head, write->wr_buflen);
+ nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages,
+ write->wr_payload.head, write->wr_buflen);
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
@@ -1425,7 +1425,7 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
return status;
}
-static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
{
dst->cp_src_pos = src->cp_src_pos;
dst->cp_dst_pos = src->cp_dst_pos;
@@ -1444,8 +1444,6 @@ static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
dst->ss_mnt = src->ss_mnt;
-
- return 0;
}
static void cleanup_async_copy(struct nfsd4_copy *copy)
@@ -1539,9 +1537,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
refcount_set(&async_copy->refcount, 1);
memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
sizeof(copy->cp_stateid));
- status = dup_copy_fields(copy, async_copy);
- if (status)
- goto out_err;
+ dup_copy_fields(copy, async_copy);
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
async_copy, "%s", "copy thread");
if (IS_ERR(async_copy->copy_task))
@@ -2276,7 +2272,7 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp,
xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
/* Tail and page_len should be zero at this point: */
buf->len = buf->head[0].iov_len;
- xdr->scratch.iov_len = 0;
+ xdr_reset_scratch_buffer(xdr);
xdr->page_ptr = buf->pages - 1;
buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
- rqstp->rq_auth_slack;
@@ -3282,7 +3278,7 @@ int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
void warn_on_nonidempotent_op(struct nfsd4_op *op)
{
if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) {
- pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
+ pr_err("unable to encode reply to nonidempotent op %u (%s)\n",
op->opnum, nfsd4_op_name(op->opnum));
WARN_ON_ONCE(1);
}
@@ -3295,16 +3291,13 @@ static const char *nfsd4_op_name(unsigned opnum)
return "unknown_operation";
}
-#define nfsd4_voidres nfsd4_voidargs
-struct nfsd4_voidargs { int dummy; };
-
static const struct svc_procedure nfsd_procedures4[2] = {
[NFSPROC4_NULL] = {
.pc_func = nfsd4_proc_null,
- .pc_decode = nfs4svc_decode_voidarg,
- .pc_encode = nfs4svc_encode_voidres,
- .pc_argsize = sizeof(struct nfsd4_voidargs),
- .pc_ressize = sizeof(struct nfsd4_voidres),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 1,
},
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d7f27ed6b794..1d2cd6a88f61 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -769,6 +769,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
spin_lock(&nn->s2s_cp_lock);
new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
stid->stid.si_opaque.so_id = new_id;
+ stid->stid.si_generation = 1;
spin_unlock(&nn->s2s_cp_lock);
idr_preload_end();
if (new_id < 0)
@@ -3066,7 +3067,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
- "ip_addr=%s flags %x, spa_how %d\n",
+ "ip_addr=%s flags %x, spa_how %u\n",
__func__, rqstp, exid, exid->clname.len, exid->clname.data,
addr_str, exid->flags, exid->spa_how);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 833a2c64dfe8..45ee6b12ce5b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -54,6 +54,8 @@
#include "pnfs.h"
#include "filecache.h"
+#include "trace.h"
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
#endif
@@ -90,6 +92,8 @@ check_filename(char *str, int len)
if (len == 0)
return nfserr_inval;
+ if (len > NFS4_MAXNAMLEN)
+ return nfserr_nametoolong;
if (isdotent(str, len))
return nfserr_badname;
for (i = 0; i < len; i++)
@@ -98,122 +102,6 @@ check_filename(char *str, int len)
return 0;
}
-#define DECODE_HEAD \
- __be32 *p; \
- __be32 status
-#define DECODE_TAIL \
- status = 0; \
-out: \
- return status; \
-xdr_error: \
- dprintk("NFSD: xdr error (%s:%d)\n", \
- __FILE__, __LINE__); \
- status = nfserr_bad_xdr; \
- goto out
-
-#define READMEM(x,nbytes) do { \
- x = (char *)p; \
- p += XDR_QUADLEN(nbytes); \
-} while (0)
-#define SAVEMEM(x,nbytes) do { \
- if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
- savemem(argp, p, nbytes) : \
- (char *)p)) { \
- dprintk("NFSD: xdr error (%s:%d)\n", \
- __FILE__, __LINE__); \
- goto xdr_error; \
- } \
- p += XDR_QUADLEN(nbytes); \
-} while (0)
-#define COPYMEM(x,nbytes) do { \
- memcpy((x), p, nbytes); \
- p += XDR_QUADLEN(nbytes); \
-} while (0)
-
-/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */
-#define READ_BUF(nbytes) do { \
- if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \
- p = argp->p; \
- argp->p += XDR_QUADLEN(nbytes); \
- } else if (!(p = read_buf(argp, nbytes))) { \
- dprintk("NFSD: xdr error (%s:%d)\n", \
- __FILE__, __LINE__); \
- goto xdr_error; \
- } \
-} while (0)
-
-static void next_decode_page(struct nfsd4_compoundargs *argp)
-{
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + XDR_QUADLEN(argp->pagelen);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
-}
-
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
-{
- /* We want more bytes than seem to be available.
- * Maybe we need a new page, maybe we have just run out
- */
- unsigned int avail = (char *)argp->end - (char *)argp->p;
- __be32 *p;
-
- if (argp->pagelen == 0) {
- struct kvec *vec = &argp->rqstp->rq_arg.tail[0];
-
- if (!argp->tail) {
- argp->tail = true;
- avail = vec->iov_len;
- argp->p = vec->iov_base;
- argp->end = vec->iov_base + avail;
- }
-
- if (avail < nbytes)
- return NULL;
-
- p = argp->p;
- argp->p += XDR_QUADLEN(nbytes);
- return p;
- }
-
- if (avail + argp->pagelen < nbytes)
- return NULL;
- if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
- return NULL;
- /* ok, we can do it with the current plus the next page */
- if (nbytes <= sizeof(argp->tmp))
- p = argp->tmp;
- else {
- kfree(argp->tmpp);
- p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL);
- if (!p)
- return NULL;
-
- }
- /*
- * The following memcpy is safe because read_buf is always
- * called with nbytes > avail, and the two cases above both
- * guarantee p points to at least nbytes bytes.
- */
- memcpy(p, argp->p, avail);
- next_decode_page(argp);
- memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
- argp->p += XDR_QUADLEN(nbytes - avail);
- return p;
-}
-
-static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
-{
- unsigned int this = (char *)argp->end - (char *)argp->p;
-
- return this + argp->pagelen;
-}
-
static int zero_clientid(clientid_t *clid)
{
return (clid->cl_boot == 0) && (clid->cl_id == 0);
@@ -259,118 +147,243 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
return p;
}
+/*
+ * NFSv4 basic data type decoders
+ */
+
+/*
+ * This helper handles variable-length opaques which belong to protocol
+ * elements that this implementation does not support.
+ */
static __be32
-svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
- struct page ***pagelist, u32 buflen)
+nfsd4_decode_ignored_string(struct nfsd4_compoundargs *argp, u32 maxlen)
{
- int avail;
- int len;
- int pages;
+ u32 len;
- /* Sorry .. no magic macros for this.. *
- * READ_BUF(write->wr_buflen);
- * SAVEMEM(write->wr_buf, write->wr_buflen);
- */
- avail = (char *)argp->end - (char *)argp->p;
- if (avail + argp->pagelen < buflen) {
- dprintk("NFSD: xdr error (%s:%d)\n",
- __FILE__, __LINE__);
+ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
+ return nfserr_bad_xdr;
+ if (maxlen && len > maxlen)
+ return nfserr_bad_xdr;
+ if (!xdr_inline_decode(argp->xdr, len))
return nfserr_bad_xdr;
- }
- head->iov_base = argp->p;
- head->iov_len = avail;
- *pagelist = argp->pagelist;
- len = XDR_QUADLEN(buflen) << 2;
- if (len >= avail) {
- len -= avail;
+ return nfs_ok;
+}
- pages = len >> PAGE_SHIFT;
- argp->pagelist += pages;
- argp->pagelen -= pages * PAGE_SIZE;
- len -= pages * PAGE_SIZE;
+static __be32
+nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+{
+ __be32 *p;
+ u32 len;
- next_decode_page(argp);
- }
- argp->p += XDR_QUADLEN(len);
+ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
+ return nfserr_bad_xdr;
+ if (len == 0 || len > NFS4_OPAQUE_LIMIT)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, len);
+ if (!p)
+ return nfserr_bad_xdr;
+ o->data = svcxdr_tmpalloc(argp, len);
+ if (!o->data)
+ return nfserr_jukebox;
+ o->len = len;
+ memcpy(o->data, p, len);
- return 0;
+ return nfs_ok;
}
-/**
- * savemem - duplicate a chunk of memory for later processing
- * @argp: NFSv4 compound argument structure to be freed with
- * @p: pointer to be duplicated
- * @nbytes: length to be duplicated
- *
- * Returns a pointer to a copy of @nbytes bytes of memory at @p
- * that are preserved until processing of the NFSv4 compound
- * operation described by @argp finishes.
- */
-static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
+static __be32
+nfsd4_decode_component4(struct nfsd4_compoundargs *argp, char **namp, u32 *lenp)
{
- void *ret;
+ __be32 *p, status;
- ret = svcxdr_tmpalloc(argp, nbytes);
- if (!ret)
- return NULL;
- memcpy(ret, p, nbytes);
- return ret;
+ if (xdr_stream_decode_u32(argp->xdr, lenp) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, *lenp);
+ if (!p)
+ return nfserr_bad_xdr;
+ status = check_filename((char *)p, *lenp);
+ if (status)
+ return status;
+ *namp = svcxdr_tmpalloc(argp, *lenp);
+ if (!*namp)
+ return nfserr_jukebox;
+ memcpy(*namp, p, *lenp);
+
+ return nfs_ok;
}
static __be32
-nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
+nfsd4_decode_nfstime4(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
{
- DECODE_HEAD;
+ __be32 *p;
- READ_BUF(12);
+ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 3);
+ if (!p)
+ return nfserr_bad_xdr;
p = xdr_decode_hyper(p, &tv->tv_sec);
tv->tv_nsec = be32_to_cpup(p++);
if (tv->tv_nsec >= (u32)1000000000)
return nfserr_inval;
+ return nfs_ok;
+}
- DECODE_TAIL;
+static __be32
+nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(argp->xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(verf->data, p, sizeof(verf->data));
+ return nfs_ok;
}
+/**
+ * nfsd4_decode_bitmap4 - Decode an NFSv4 bitmap4
+ * @argp: NFSv4 compound argument structure
+ * @bmval: pointer to an array of u32's to decode into
+ * @bmlen: size of the @bmval array
+ *
+ * The server needs to return nfs_ok rather than nfserr_bad_xdr when
+ * encountering bitmaps containing bits it does not recognize. This
+ * includes bits in bitmap words past WORDn, where WORDn is the last
+ * bitmap WORD the implementation currently supports. Thus we are
+ * careful here to simply ignore bits in bitmap words that this
+ * implementation has yet to support explicitly.
+ *
+ * Return values:
+ * %nfs_ok: @bmval populated successfully
+ * %nfserr_bad_xdr: the encoded bitmap was invalid
+ */
static __be32
-nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
+nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen)
{
- u32 bmlen;
- DECODE_HEAD;
+ u32 i, count;
+ __be32 *p;
+
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+ /* request sanity */
+ if (count > 1000)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, count << 2);
+ if (!p)
+ return nfserr_bad_xdr;
+ i = 0;
+ while (i < count)
+ bmval[i++] = be32_to_cpup(p++);
+ while (i < bmlen)
+ bmval[i++] = 0;
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_nfsace4(struct nfsd4_compoundargs *argp, struct nfs4_ace *ace)
+{
+ __be32 *p, status;
+ u32 length;
+
+ if (xdr_stream_decode_u32(argp->xdr, &ace->type) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &ace->flag) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &ace->access_mask) < 0)
+ return nfserr_bad_xdr;
+
+ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, length);
+ if (!p)
+ return nfserr_bad_xdr;
+ ace->whotype = nfs4_acl_get_whotype((char *)p, length);
+ if (ace->whotype != NFS4_ACL_WHO_NAMED)
+ status = nfs_ok;
+ else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ status = nfsd_map_name_to_gid(argp->rqstp,
+ (char *)p, length, &ace->who_gid);
+ else
+ status = nfsd_map_name_to_uid(argp->rqstp,
+ (char *)p, length, &ace->who_uid);
+
+ return status;
+}
+
+/* A counted array of nfsace4's */
+static noinline __be32
+nfsd4_decode_acl(struct nfsd4_compoundargs *argp, struct nfs4_acl **acl)
+{
+ struct nfs4_ace *ace;
+ __be32 status;
+ u32 count;
+
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+
+ if (count > xdr_stream_remaining(argp->xdr) / 20)
+ /*
+ * Even with 4-byte names there wouldn't be
+ * space for that many aces; something fishy is
+ * going on:
+ */
+ return nfserr_fbig;
+
+ *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(count));
+ if (*acl == NULL)
+ return nfserr_jukebox;
+
+ (*acl)->naces = count;
+ for (ace = (*acl)->aces; ace < (*acl)->aces + count; ace++) {
+ status = nfsd4_decode_nfsace4(argp, ace);
+ if (status)
+ return status;
+ }
+
+ return nfs_ok;
+}
- bmval[0] = 0;
- bmval[1] = 0;
- bmval[2] = 0;
+static noinline __be32
+nfsd4_decode_security_label(struct nfsd4_compoundargs *argp,
+ struct xdr_netobj *label)
+{
+ u32 lfs, pi, length;
+ __be32 *p;
- READ_BUF(4);
- bmlen = be32_to_cpup(p++);
- if (bmlen > 1000)
- goto xdr_error;
+ if (xdr_stream_decode_u32(argp->xdr, &lfs) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &pi) < 0)
+ return nfserr_bad_xdr;
- READ_BUF(bmlen << 2);
- if (bmlen > 0)
- bmval[0] = be32_to_cpup(p++);
- if (bmlen > 1)
- bmval[1] = be32_to_cpup(p++);
- if (bmlen > 2)
- bmval[2] = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
+ return nfserr_bad_xdr;
+ if (length > NFS4_MAXLABELLEN)
+ return nfserr_badlabel;
+ p = xdr_inline_decode(argp->xdr, length);
+ if (!p)
+ return nfserr_bad_xdr;
+ label->len = length;
+ label->data = svcxdr_dupstr(argp, p, length);
+ if (!label->data)
+ return nfserr_jukebox;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
- struct iattr *iattr, struct nfs4_acl **acl,
- struct xdr_netobj *label, int *umask)
+nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
+ struct iattr *iattr, struct nfs4_acl **acl,
+ struct xdr_netobj *label, int *umask)
{
- int expected_len, len = 0;
- u32 dummy32;
- char *buf;
+ unsigned int starting_pos;
+ u32 attrlist4_count;
+ __be32 *p, status;
- DECODE_HEAD;
iattr->ia_valid = 0;
- if ((status = nfsd4_decode_bitmap(argp, bmval)))
- return status;
+ status = nfsd4_decode_bitmap4(argp, bmval, bmlen);
+ if (status)
+ return nfserr_bad_xdr;
if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
|| bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
@@ -380,96 +393,69 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
return nfserr_attrnotsupp;
}
- READ_BUF(4);
- expected_len = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &attrlist4_count) < 0)
+ return nfserr_bad_xdr;
+ starting_pos = xdr_stream_pos(argp->xdr);
if (bmval[0] & FATTR4_WORD0_SIZE) {
- READ_BUF(8);
- len += 8;
- p = xdr_decode_hyper(p, &iattr->ia_size);
+ u64 size;
+
+ if (xdr_stream_decode_u64(argp->xdr, &size) < 0)
+ return nfserr_bad_xdr;
+ iattr->ia_size = size;
iattr->ia_valid |= ATTR_SIZE;
}
if (bmval[0] & FATTR4_WORD0_ACL) {
- u32 nace;
- struct nfs4_ace *ace;
-
- READ_BUF(4); len += 4;
- nace = be32_to_cpup(p++);
-
- if (nace > compoundargs_bytes_left(argp)/20)
- /*
- * Even with 4-byte names there wouldn't be
- * space for that many aces; something fishy is
- * going on:
- */
- return nfserr_fbig;
-
- *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
- if (*acl == NULL)
- return nfserr_jukebox;
-
- (*acl)->naces = nace;
- for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
- READ_BUF(16); len += 16;
- ace->type = be32_to_cpup(p++);
- ace->flag = be32_to_cpup(p++);
- ace->access_mask = be32_to_cpup(p++);
- dummy32 = be32_to_cpup(p++);
- READ_BUF(dummy32);
- len += XDR_QUADLEN(dummy32) << 2;
- READMEM(buf, dummy32);
- ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
- status = nfs_ok;
- if (ace->whotype != NFS4_ACL_WHO_NAMED)
- ;
- else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
- status = nfsd_map_name_to_gid(argp->rqstp,
- buf, dummy32, &ace->who_gid);
- else
- status = nfsd_map_name_to_uid(argp->rqstp,
- buf, dummy32, &ace->who_uid);
- if (status)
- return status;
- }
+ status = nfsd4_decode_acl(argp, acl);
+ if (status)
+ return status;
} else
*acl = NULL;
if (bmval[1] & FATTR4_WORD1_MODE) {
- READ_BUF(4);
- len += 4;
- iattr->ia_mode = be32_to_cpup(p++);
+ u32 mode;
+
+ if (xdr_stream_decode_u32(argp->xdr, &mode) < 0)
+ return nfserr_bad_xdr;
+ iattr->ia_mode = mode;
iattr->ia_mode &= (S_IFMT | S_IALLUGO);
iattr->ia_valid |= ATTR_MODE;
}
if (bmval[1] & FATTR4_WORD1_OWNER) {
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++);
- READ_BUF(dummy32);
- len += (XDR_QUADLEN(dummy32) << 2);
- READMEM(buf, dummy32);
- if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+ u32 length;
+
+ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, length);
+ if (!p)
+ return nfserr_bad_xdr;
+ status = nfsd_map_name_to_uid(argp->rqstp, (char *)p, length,
+ &iattr->ia_uid);
+ if (status)
return status;
iattr->ia_valid |= ATTR_UID;
}
if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++);
- READ_BUF(dummy32);
- len += (XDR_QUADLEN(dummy32) << 2);
- READMEM(buf, dummy32);
- if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+ u32 length;
+
+ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, length);
+ if (!p)
+ return nfserr_bad_xdr;
+ status = nfsd_map_name_to_gid(argp->rqstp, (char *)p, length,
+ &iattr->ia_gid);
+ if (status)
return status;
iattr->ia_valid |= ATTR_GID;
}
if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++);
- switch (dummy32) {
+ u32 set_it;
+
+ if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0)
+ return nfserr_bad_xdr;
+ switch (set_it) {
case NFS4_SET_TO_CLIENT_TIME:
- len += 12;
- status = nfsd4_decode_time(argp, &iattr->ia_atime);
+ status = nfsd4_decode_nfstime4(argp, &iattr->ia_atime);
if (status)
return status;
iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
@@ -478,17 +464,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
iattr->ia_valid |= ATTR_ATIME;
break;
default:
- goto xdr_error;
+ return nfserr_bad_xdr;
}
}
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++);
- switch (dummy32) {
+ u32 set_it;
+
+ if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0)
+ return nfserr_bad_xdr;
+ switch (set_it) {
case NFS4_SET_TO_CLIENT_TIME:
- len += 12;
- status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+ status = nfsd4_decode_nfstime4(argp, &iattr->ia_mtime);
if (status)
return status;
iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
@@ -497,222 +483,329 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
iattr->ia_valid |= ATTR_MTIME;
break;
default:
- goto xdr_error;
+ return nfserr_bad_xdr;
}
}
-
label->len = 0;
if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) &&
bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
- READ_BUF(4);
- len += 4;
- dummy32 = be32_to_cpup(p++);
- READ_BUF(dummy32);
- if (dummy32 > NFS4_MAXLABELLEN)
- return nfserr_badlabel;
- len += (XDR_QUADLEN(dummy32) << 2);
- READMEM(buf, dummy32);
- label->len = dummy32;
- label->data = svcxdr_dupstr(argp, buf, dummy32);
- if (!label->data)
- return nfserr_jukebox;
+ status = nfsd4_decode_security_label(argp, label);
+ if (status)
+ return status;
}
if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ u32 mode, mask;
+
if (!umask)
- goto xdr_error;
- READ_BUF(8);
- len += 8;
- dummy32 = be32_to_cpup(p++);
- iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO);
- dummy32 = be32_to_cpup(p++);
- *umask = dummy32 & S_IRWXUGO;
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &mode) < 0)
+ return nfserr_bad_xdr;
+ iattr->ia_mode = mode & (S_IFMT | S_IALLUGO);
+ if (xdr_stream_decode_u32(argp->xdr, &mask) < 0)
+ return nfserr_bad_xdr;
+ *umask = mask & S_IRWXUGO;
iattr->ia_valid |= ATTR_MODE;
}
- if (len != expected_len)
- goto xdr_error;
- DECODE_TAIL;
+ /* request sanity: did attrlist4 contain the expected number of words? */
+ if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos)
+ return nfserr_bad_xdr;
+
+ return nfs_ok;
}
static __be32
-nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+nfsd4_decode_stateid4(struct nfsd4_compoundargs *argp, stateid_t *sid)
{
- DECODE_HEAD;
+ __be32 *p;
- READ_BUF(sizeof(stateid_t));
+ p = xdr_inline_decode(argp->xdr, NFS4_STATEID_SIZE);
+ if (!p)
+ return nfserr_bad_xdr;
sid->si_generation = be32_to_cpup(p++);
- COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+ memcpy(&sid->si_opaque, p, sizeof(sid->si_opaque));
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_clientid4(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+{
+ __be32 *p;
- DECODE_TAIL;
+ p = xdr_inline_decode(argp->xdr, sizeof(__be64));
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(clientid, p, sizeof(*clientid));
+ return nfs_ok;
}
static __be32
-nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
+nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp,
+ clientid_t *clientid, struct xdr_netobj *owner)
{
- DECODE_HEAD;
+ __be32 status;
+
+ status = nfsd4_decode_clientid4(argp, clientid);
+ if (status)
+ return status;
+ return nfsd4_decode_opaque(argp, owner);
+}
- READ_BUF(4);
- access->ac_req_access = be32_to_cpup(p++);
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_deviceid *devid)
+{
+ __be32 *p;
- DECODE_TAIL;
+ p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE);
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(devid, p, sizeof(*devid));
+ return nfs_ok;
}
-static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+static __be32
+nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutcommit *lcp)
{
- DECODE_HEAD;
- struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
- u32 dummy, uid, gid;
- char *machine_name;
- int i;
- int nr_secflavs;
+ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0)
+ return nfserr_bad_xdr;
+ if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES)
+ return nfserr_bad_xdr;
+ if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX)
+ return nfserr_bad_xdr;
+
+ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
+ return nfserr_bad_xdr;
+ if (lcp->lc_up_len > 0) {
+ lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
+ if (!lcp->lc_up_layout)
+ return nfserr_bad_xdr;
+ }
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_layoutreturn4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutreturn *lrp)
+{
+ __be32 status;
+
+ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_return_type) < 0)
+ return nfserr_bad_xdr;
+ switch (lrp->lr_return_type) {
+ case RETURN_FILE:
+ if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.length) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &lrp->lr_sid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &lrp->lrf_body_len) < 0)
+ return nfserr_bad_xdr;
+ if (lrp->lrf_body_len > 0) {
+ lrp->lrf_body = xdr_inline_decode(argp->xdr, lrp->lrf_body_len);
+ if (!lrp->lrf_body)
+ return nfserr_bad_xdr;
+ }
+ break;
+ case RETURN_FSID:
+ case RETURN_ALL:
+ lrp->lr_seg.offset = 0;
+ lrp->lr_seg.length = NFS4_MAX_UINT64;
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+
+ return nfs_ok;
+}
+
+#endif /* CONFIG_NFSD_PNFS */
+
+static __be32
+nfsd4_decode_sessionid4(struct nfsd4_compoundargs *argp,
+ struct nfs4_sessionid *sessionid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(argp->xdr, NFS4_MAX_SESSIONID_LEN);
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(sessionid->data, p, sizeof(sessionid->data));
+ return nfs_ok;
+}
+
+/* Defined in Appendix A of RFC 5531 */
+static __be32
+nfsd4_decode_authsys_parms(struct nfsd4_compoundargs *argp,
+ struct nfsd4_cb_sec *cbs)
+{
+ u32 stamp, gidcount, uid, gid;
+ __be32 *p, status;
+
+ if (xdr_stream_decode_u32(argp->xdr, &stamp) < 0)
+ return nfserr_bad_xdr;
+ /* machine name */
+ status = nfsd4_decode_ignored_string(argp, 255);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &uid) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &gid) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &gidcount) < 0)
+ return nfserr_bad_xdr;
+ if (gidcount > 16)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, gidcount << 2);
+ if (!p)
+ return nfserr_bad_xdr;
+ if (cbs->flavor == (u32)(-1)) {
+ struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
+
+ kuid_t kuid = make_kuid(userns, uid);
+ kgid_t kgid = make_kgid(userns, gid);
+ if (uid_valid(kuid) && gid_valid(kgid)) {
+ cbs->uid = kuid;
+ cbs->gid = kgid;
+ cbs->flavor = RPC_AUTH_UNIX;
+ } else {
+ dprintk("RPC_AUTH_UNIX with invalid uid or gid, ignoring!\n");
+ }
+ }
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_gss_cb_handles4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_cb_sec *cbs)
+{
+ __be32 status;
+ u32 service;
+
+ dprintk("RPC_AUTH_GSS callback secflavor not supported!\n");
+
+ if (xdr_stream_decode_u32(argp->xdr, &service) < 0)
+ return nfserr_bad_xdr;
+ if (service < RPC_GSS_SVC_NONE || service > RPC_GSS_SVC_PRIVACY)
+ return nfserr_bad_xdr;
+ /* gcbp_handle_from_server */
+ status = nfsd4_decode_ignored_string(argp, 0);
+ if (status)
+ return status;
+ /* gcbp_handle_from_client */
+ status = nfsd4_decode_ignored_string(argp, 0);
+ if (status)
+ return status;
+
+ return nfs_ok;
+}
+
+/* a counted array of callback_sec_parms4 items */
+static __be32
+nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+ u32 i, secflavor, nr_secflavs;
+ __be32 status;
/* callback_sec_params4 */
- READ_BUF(4);
- nr_secflavs = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &nr_secflavs) < 0)
+ return nfserr_bad_xdr;
if (nr_secflavs)
cbs->flavor = (u32)(-1);
else
/* Is this legal? Be generous, take it to mean AUTH_NONE: */
cbs->flavor = 0;
+
for (i = 0; i < nr_secflavs; ++i) {
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- switch (dummy) {
+ if (xdr_stream_decode_u32(argp->xdr, &secflavor) < 0)
+ return nfserr_bad_xdr;
+ switch (secflavor) {
case RPC_AUTH_NULL:
- /* Nothing to read */
+ /* void */
if (cbs->flavor == (u32)(-1))
cbs->flavor = RPC_AUTH_NULL;
break;
case RPC_AUTH_UNIX:
- READ_BUF(8);
- /* stamp */
- dummy = be32_to_cpup(p++);
-
- /* machine name */
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy);
- SAVEMEM(machine_name, dummy);
-
- /* uid, gid */
- READ_BUF(8);
- uid = be32_to_cpup(p++);
- gid = be32_to_cpup(p++);
-
- /* more gids */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- if (cbs->flavor == (u32)(-1)) {
- kuid_t kuid = make_kuid(userns, uid);
- kgid_t kgid = make_kgid(userns, gid);
- if (uid_valid(kuid) && gid_valid(kgid)) {
- cbs->uid = kuid;
- cbs->gid = kgid;
- cbs->flavor = RPC_AUTH_UNIX;
- } else {
- dprintk("RPC_AUTH_UNIX with invalid"
- "uid or gid ignoring!\n");
- }
- }
+ status = nfsd4_decode_authsys_parms(argp, cbs);
+ if (status)
+ return status;
break;
case RPC_AUTH_GSS:
- dprintk("RPC_AUTH_GSS callback secflavor "
- "not supported!\n");
- READ_BUF(8);
- /* gcbp_service */
- dummy = be32_to_cpup(p++);
- /* gcbp_handle_from_server */
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
- /* gcbp_handle_from_client */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy);
+ status = nfsd4_decode_gss_cb_handles4(argp, cbs);
+ if (status)
+ return status;
break;
default:
- dprintk("Illegal callback secflavor\n");
return nfserr_inval;
}
}
- DECODE_TAIL;
-}
-static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
-{
- DECODE_HEAD;
+ return nfs_ok;
+}
- READ_BUF(4);
- bc->bc_cb_program = be32_to_cpup(p++);
- nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
- DECODE_TAIL;
-}
+/*
+ * NFSv4 operation argument decoders
+ */
-static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+static __be32
+nfsd4_decode_access(struct nfsd4_compoundargs *argp,
+ struct nfsd4_access *access)
{
- DECODE_HEAD;
-
- READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
- COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- bcts->dir = be32_to_cpup(p++);
- /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
- * could help us figure out we should be using it. */
- DECODE_TAIL;
+ if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0)
+ return nfserr_bad_xdr;
+ return nfs_ok;
}
static __be32
nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
{
- DECODE_HEAD;
-
- READ_BUF(4);
- close->cl_seqid = be32_to_cpup(p++);
- return nfsd4_decode_stateid(argp, &close->cl_stateid);
-
- DECODE_TAIL;
+ if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_stateid4(argp, &close->cl_stateid);
}
static __be32
nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
{
- DECODE_HEAD;
-
- READ_BUF(12);
- p = xdr_decode_hyper(p, &commit->co_offset);
- commit->co_count = be32_to_cpup(p++);
-
- DECODE_TAIL;
+ if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
+ return nfserr_bad_xdr;
+ return nfs_ok;
}
static __be32
nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
{
- DECODE_HEAD;
+ __be32 *p, status;
- READ_BUF(4);
- create->cr_type = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0)
+ return nfserr_bad_xdr;
switch (create->cr_type) {
case NF4LNK:
- READ_BUF(4);
- create->cr_datalen = be32_to_cpup(p++);
- READ_BUF(create->cr_datalen);
+ if (xdr_stream_decode_u32(argp->xdr, &create->cr_datalen) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, create->cr_datalen);
+ if (!p)
+ return nfserr_bad_xdr;
create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
if (!create->cr_data)
return nfserr_jukebox;
break;
case NF4BLK:
case NF4CHR:
- READ_BUF(8);
- create->cr_specdata1 = be32_to_cpup(p++);
- create->cr_specdata2 = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata1) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata2) < 0)
+ return nfserr_bad_xdr;
break;
case NF4SOCK:
case NF4FIFO:
@@ -720,151 +813,210 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
default:
break;
}
-
- READ_BUF(4);
- create->cr_namelen = be32_to_cpup(p++);
- READ_BUF(create->cr_namelen);
- SAVEMEM(create->cr_name, create->cr_namelen);
- if ((status = check_filename(create->cr_name, create->cr_namelen)))
+ status = nfsd4_decode_component4(argp, &create->cr_name,
+ &create->cr_namelen);
+ if (status)
return status;
-
- status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
- &create->cr_acl, &create->cr_label,
- &create->cr_umask);
+ status = nfsd4_decode_fattr4(argp, create->cr_bmval,
+ ARRAY_SIZE(create->cr_bmval),
+ &create->cr_iattr, &create->cr_acl,
+ &create->cr_label, &create->cr_umask);
if (status)
- goto out;
+ return status;
- DECODE_TAIL;
+ return nfs_ok;
}
static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
- return nfsd4_decode_stateid(argp, &dr->dr_stateid);
+ return nfsd4_decode_stateid4(argp, &dr->dr_stateid);
}
static inline __be32
nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
{
- return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
+ return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
+ ARRAY_SIZE(getattr->ga_bmval));
}
static __be32
nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
{
- DECODE_HEAD;
+ return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
+}
- READ_BUF(4);
- link->li_namelen = be32_to_cpup(p++);
- READ_BUF(link->li_namelen);
- SAVEMEM(link->li_name, link->li_namelen);
- if ((status = check_filename(link->li_name, link->li_namelen)))
+static __be32
+nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_lock *lock)
+{
+ __be32 status;
+
+ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid);
+ if (status)
return status;
+ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid,
+ &lock->lk_new_owner);
+}
- DECODE_TAIL;
+static __be32
+nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_lock *lock)
+{
+ __be32 status;
+
+ status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0)
+ return nfserr_bad_xdr;
+
+ return nfs_ok;
}
static __be32
-nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
{
- DECODE_HEAD;
+ if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0)
+ return nfserr_bad_xdr;
+ if (lock->lk_is_new)
+ return nfsd4_decode_open_to_lock_owner4(argp, lock);
+ return nfsd4_decode_exist_lock_owner4(argp, lock);
+}
- /*
- * type, reclaim(boolean), offset, length, new_lock_owner(boolean)
- */
- READ_BUF(28);
- lock->lk_type = be32_to_cpup(p++);
+static __be32
+nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+{
+ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
+ return nfserr_bad_xdr;
if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
- goto xdr_error;
- lock->lk_reclaim = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &lock->lk_offset);
- p = xdr_decode_hyper(p, &lock->lk_length);
- lock->lk_is_new = be32_to_cpup(p++);
-
- if (lock->lk_is_new) {
- READ_BUF(4);
- lock->lk_new_open_seqid = be32_to_cpup(p++);
- status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
- if (status)
- return status;
- READ_BUF(8 + sizeof(clientid_t));
- lock->lk_new_lock_seqid = be32_to_cpup(p++);
- COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
- lock->lk_new_owner.len = be32_to_cpup(p++);
- READ_BUF(lock->lk_new_owner.len);
- READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
- } else {
- status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
- if (status)
- return status;
- READ_BUF(4);
- lock->lk_old_lock_seqid = be32_to_cpup(p++);
- }
-
- DECODE_TAIL;
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_bool(argp->xdr, &lock->lk_reclaim) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lock->lk_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lock->lk_length) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_locker4(argp, lock);
}
static __be32
nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
{
- DECODE_HEAD;
-
- READ_BUF(32);
- lockt->lt_type = be32_to_cpup(p++);
- if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
- goto xdr_error;
- p = xdr_decode_hyper(p, &lockt->lt_offset);
- p = xdr_decode_hyper(p, &lockt->lt_length);
- COPYMEM(&lockt->lt_clientid, 8);
- lockt->lt_owner.len = be32_to_cpup(p++);
- READ_BUF(lockt->lt_owner.len);
- READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
-
- DECODE_TAIL;
+ if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
+ return nfserr_bad_xdr;
+ if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_length) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_state_owner4(argp, &lockt->lt_clientid,
+ &lockt->lt_owner);
}
static __be32
nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
{
- DECODE_HEAD;
+ __be32 status;
- READ_BUF(8);
- locku->lu_type = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0)
+ return nfserr_bad_xdr;
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
- goto xdr_error;
- locku->lu_seqid = be32_to_cpup(p++);
- status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &locku->lu_seqid) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &locku->lu_stateid);
if (status)
return status;
- READ_BUF(16);
- p = xdr_decode_hyper(p, &locku->lu_offset);
- p = xdr_decode_hyper(p, &locku->lu_length);
+ if (xdr_stream_decode_u64(argp->xdr, &locku->lu_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &locku->lu_length) < 0)
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
{
- DECODE_HEAD;
+ return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len);
+}
- READ_BUF(4);
- lookup->lo_len = be32_to_cpup(p++);
- READ_BUF(lookup->lo_len);
- SAVEMEM(lookup->lo_name, lookup->lo_len);
- if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
- return status;
+static __be32
+nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+{
+ __be32 status;
- DECODE_TAIL;
+ if (xdr_stream_decode_u32(argp->xdr, &open->op_createmode) < 0)
+ return nfserr_bad_xdr;
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ case NFS4_CREATE_GUARDED:
+ status = nfsd4_decode_fattr4(argp, open->op_bmval,
+ ARRAY_SIZE(open->op_bmval),
+ &open->op_iattr, &open->op_acl,
+ &open->op_label, &open->op_umask);
+ if (status)
+ return status;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+ status = nfsd4_decode_verifier4(argp, &open->op_verf);
+ if (status)
+ return status;
+ break;
+ case NFS4_CREATE_EXCLUSIVE4_1:
+ if (argp->minorversion < 1)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_verifier4(argp, &open->op_verf);
+ if (status)
+ return status;
+ status = nfsd4_decode_fattr4(argp, open->op_bmval,
+ ARRAY_SIZE(open->op_bmval),
+ &open->op_iattr, &open->op_acl,
+ &open->op_label, &open->op_umask);
+ if (status)
+ return status;
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_openflag4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+{
+ __be32 status;
+
+ if (xdr_stream_decode_u32(argp->xdr, &open->op_create) < 0)
+ return nfserr_bad_xdr;
+ switch (open->op_create) {
+ case NFS4_OPEN_NOCREATE:
+ break;
+ case NFS4_OPEN_CREATE:
+ status = nfsd4_decode_createhow4(argp, open);
+ if (status)
+ return status;
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
+
+ return nfs_ok;
}
static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when)
{
- __be32 *p;
u32 w;
- READ_BUF(4);
- w = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &w) < 0)
+ return nfserr_bad_xdr;
*share_access = w & NFS4_SHARE_ACCESS_MASK;
*deleg_want = w & NFS4_SHARE_WANT_MASK;
if (deleg_when)
@@ -907,206 +1059,153 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh
NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
return nfs_ok;
}
-xdr_error:
return nfserr_bad_xdr;
}
static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
{
- __be32 *p;
-
- READ_BUF(4);
- *x = be32_to_cpup(p++);
- /* Note: unlinke access bits, deny bits may be zero. */
- if (*x & ~NFS4_SHARE_DENY_BOTH)
+ if (xdr_stream_decode_u32(argp->xdr, x) < 0)
return nfserr_bad_xdr;
- return nfs_ok;
-xdr_error:
- return nfserr_bad_xdr;
-}
-
-static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
-{
- __be32 *p;
-
- READ_BUF(4);
- o->len = be32_to_cpup(p++);
-
- if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
+ /* Note: unlike access bits, deny bits may be zero. */
+ if (*x & ~NFS4_SHARE_DENY_BOTH)
return nfserr_bad_xdr;
- READ_BUF(o->len);
- SAVEMEM(o->data, o->len);
return nfs_ok;
-xdr_error:
- return nfserr_bad_xdr;
}
static __be32
-nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_open *open)
{
- DECODE_HEAD;
- u32 dummy;
-
- memset(open->op_bmval, 0, sizeof(open->op_bmval));
- open->op_iattr.ia_valid = 0;
- open->op_openowner = NULL;
-
- open->op_xdr_error = 0;
- /* seqid, share_access, share_deny, clientid, ownerlen */
- READ_BUF(4);
- open->op_seqid = be32_to_cpup(p++);
- /* decode, yet ignore deleg_when until supported */
- status = nfsd4_decode_share_access(argp, &open->op_share_access,
- &open->op_deleg_want, &dummy);
- if (status)
- goto xdr_error;
- status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
- if (status)
- goto xdr_error;
- READ_BUF(sizeof(clientid_t));
- COPYMEM(&open->op_clientid, sizeof(clientid_t));
- status = nfsd4_decode_opaque(argp, &open->op_owner);
- if (status)
- goto xdr_error;
- READ_BUF(4);
- open->op_create = be32_to_cpup(p++);
- switch (open->op_create) {
- case NFS4_OPEN_NOCREATE:
- break;
- case NFS4_OPEN_CREATE:
- READ_BUF(4);
- open->op_createmode = be32_to_cpup(p++);
- switch (open->op_createmode) {
- case NFS4_CREATE_UNCHECKED:
- case NFS4_CREATE_GUARDED:
- status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl, &open->op_label,
- &open->op_umask);
- if (status)
- goto out;
- break;
- case NFS4_CREATE_EXCLUSIVE:
- READ_BUF(NFS4_VERIFIER_SIZE);
- COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
- break;
- case NFS4_CREATE_EXCLUSIVE4_1:
- if (argp->minorversion < 1)
- goto xdr_error;
- READ_BUF(NFS4_VERIFIER_SIZE);
- COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
- status = nfsd4_decode_fattr(argp, open->op_bmval,
- &open->op_iattr, &open->op_acl, &open->op_label,
- &open->op_umask);
- if (status)
- goto out;
- break;
- default:
- goto xdr_error;
- }
- break;
- default:
- goto xdr_error;
- }
+ __be32 status;
- /* open_claim */
- READ_BUF(4);
- open->op_claim_type = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &open->op_claim_type) < 0)
+ return nfserr_bad_xdr;
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- READ_BUF(4);
- open->op_fname.len = be32_to_cpup(p++);
- READ_BUF(open->op_fname.len);
- SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+ status = nfsd4_decode_component4(argp, &open->op_fname,
+ &open->op_fnamelen);
+ if (status)
return status;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- READ_BUF(4);
- open->op_delegate_type = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &open->op_delegate_type) < 0)
+ return nfserr_bad_xdr;
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid);
if (status)
return status;
- READ_BUF(4);
- open->op_fname.len = be32_to_cpup(p++);
- READ_BUF(open->op_fname.len);
- SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
+ status = nfsd4_decode_component4(argp, &open->op_fname,
+ &open->op_fnamelen);
+ if (status)
return status;
break;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
if (argp->minorversion < 1)
- goto xdr_error;
+ return nfserr_bad_xdr;
/* void */
break;
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
if (argp->minorversion < 1)
- goto xdr_error;
- status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid);
if (status)
return status;
break;
default:
- goto xdr_error;
+ return nfserr_bad_xdr;
}
- DECODE_TAIL;
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+{
+ __be32 status;
+ u32 dummy;
+
+ memset(open->op_bmval, 0, sizeof(open->op_bmval));
+ open->op_iattr.ia_valid = 0;
+ open->op_openowner = NULL;
+
+ open->op_xdr_error = 0;
+ if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0)
+ return nfserr_bad_xdr;
+ /* deleg_want is ignored */
+ status = nfsd4_decode_share_access(argp, &open->op_share_access,
+ &open->op_deleg_want, &dummy);
+ if (status)
+ return status;
+ status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+ if (status)
+ return status;
+ status = nfsd4_decode_state_owner4(argp, &open->op_clientid,
+ &open->op_owner);
+ if (status)
+ return status;
+ status = nfsd4_decode_openflag4(argp, open);
+ if (status)
+ return status;
+ return nfsd4_decode_open_claim4(argp, open);
}
static __be32
nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
{
- DECODE_HEAD;
+ __be32 status;
if (argp->minorversion >= 1)
return nfserr_notsupp;
- status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+ status = nfsd4_decode_stateid4(argp, &open_conf->oc_req_stateid);
if (status)
return status;
- READ_BUF(4);
- open_conf->oc_seqid = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0)
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
{
- DECODE_HEAD;
-
- status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+ __be32 status;
+
+ status = nfsd4_decode_stateid4(argp, &open_down->od_stateid);
if (status)
return status;
- READ_BUF(4);
- open_down->od_seqid = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &open_down->od_seqid) < 0)
+ return nfserr_bad_xdr;
+ /* deleg_want is ignored */
status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
&open_down->od_deleg_want, NULL);
if (status)
return status;
- status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
- if (status)
- return status;
- DECODE_TAIL;
+ return nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
}
static __be32
nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
{
- DECODE_HEAD;
+ __be32 *p;
- READ_BUF(4);
- putfh->pf_fhlen = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0)
+ return nfserr_bad_xdr;
if (putfh->pf_fhlen > NFS4_FHSIZE)
- goto xdr_error;
- READ_BUF(putfh->pf_fhlen);
- SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen);
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, putfh->pf_fhlen);
+ if (!p)
+ return nfserr_bad_xdr;
+ putfh->pf_fhval = svcxdr_tmpalloc(argp, putfh->pf_fhlen);
+ if (!putfh->pf_fhval)
+ return nfserr_jukebox;
+ memcpy(putfh->pf_fhval, p, putfh->pf_fhlen);
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
@@ -1120,109 +1219,68 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
static __be32
nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
- DECODE_HEAD;
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+ status = nfsd4_decode_stateid4(argp, &read->rd_stateid);
if (status)
return status;
- READ_BUF(12);
- p = xdr_decode_hyper(p, &read->rd_offset);
- read->rd_length = be32_to_cpup(p++);
+ if (xdr_stream_decode_u64(argp->xdr, &read->rd_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &read->rd_length) < 0)
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
{
- DECODE_HEAD;
+ __be32 status;
- READ_BUF(24);
- p = xdr_decode_hyper(p, &readdir->rd_cookie);
- COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
- readdir->rd_dircount = be32_to_cpup(p++);
- readdir->rd_maxcount = be32_to_cpup(p++);
- if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval)))
- goto out;
+ if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_verifier4(argp, &readdir->rd_verf);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_dircount) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_maxcount) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_uint32_array(argp->xdr, readdir->rd_bmval,
+ ARRAY_SIZE(readdir->rd_bmval)) < 0)
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
{
- DECODE_HEAD;
-
- READ_BUF(4);
- remove->rm_namelen = be32_to_cpup(p++);
- READ_BUF(remove->rm_namelen);
- SAVEMEM(remove->rm_name, remove->rm_namelen);
- if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
- return status;
-
- DECODE_TAIL;
+ return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
}
static __be32
nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
{
- DECODE_HEAD;
-
- READ_BUF(4);
- rename->rn_snamelen = be32_to_cpup(p++);
- READ_BUF(rename->rn_snamelen);
- SAVEMEM(rename->rn_sname, rename->rn_snamelen);
- READ_BUF(4);
- rename->rn_tnamelen = be32_to_cpup(p++);
- READ_BUF(rename->rn_tnamelen);
- SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
- if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
- return status;
- if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
- return status;
+ __be32 status;
- DECODE_TAIL;
+ status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen);
+ if (status)
+ return status;
+ return nfsd4_decode_component4(argp, &rename->rn_tname, &rename->rn_tnamelen);
}
static __be32
nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
{
- DECODE_HEAD;
-
- if (argp->minorversion >= 1)
- return nfserr_notsupp;
-
- READ_BUF(sizeof(clientid_t));
- COPYMEM(clientid, sizeof(clientid_t));
-
- DECODE_TAIL;
+ return nfsd4_decode_clientid4(argp, clientid);
}
static __be32
nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
struct nfsd4_secinfo *secinfo)
{
- DECODE_HEAD;
-
- READ_BUF(4);
- secinfo->si_namelen = be32_to_cpup(p++);
- READ_BUF(secinfo->si_namelen);
- SAVEMEM(secinfo->si_name, secinfo->si_namelen);
- status = check_filename(secinfo->si_name, secinfo->si_namelen);
- if (status)
- return status;
- DECODE_TAIL;
-}
-
-static __be32
-nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
- struct nfsd4_secinfo_no_name *sin)
-{
- DECODE_HEAD;
-
- READ_BUF(4);
- sin->sin_style = be32_to_cpup(p++);
- DECODE_TAIL;
+ return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
}
static __be32
@@ -1230,362 +1288,381 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
{
__be32 status;
- status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+ status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid);
if (status)
return status;
- return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
- &setattr->sa_acl, &setattr->sa_label, NULL);
+ return nfsd4_decode_fattr4(argp, setattr->sa_bmval,
+ ARRAY_SIZE(setattr->sa_bmval),
+ &setattr->sa_iattr, &setattr->sa_acl,
+ &setattr->sa_label, NULL);
}
static __be32
nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
{
- DECODE_HEAD;
+ __be32 *p, status;
if (argp->minorversion >= 1)
return nfserr_notsupp;
- READ_BUF(NFS4_VERIFIER_SIZE);
- COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
-
+ status = nfsd4_decode_verifier4(argp, &setclientid->se_verf);
+ if (status)
+ return status;
status = nfsd4_decode_opaque(argp, &setclientid->se_name);
if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_prog) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_netid_len) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, setclientid->se_callback_netid_len);
+ if (!p)
return nfserr_bad_xdr;
- READ_BUF(8);
- setclientid->se_callback_prog = be32_to_cpup(p++);
- setclientid->se_callback_netid_len = be32_to_cpup(p++);
- READ_BUF(setclientid->se_callback_netid_len);
- SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
- READ_BUF(4);
- setclientid->se_callback_addr_len = be32_to_cpup(p++);
+ setclientid->se_callback_netid_val = svcxdr_tmpalloc(argp,
+ setclientid->se_callback_netid_len);
+ if (!setclientid->se_callback_netid_val)
+ return nfserr_jukebox;
+ memcpy(setclientid->se_callback_netid_val, p,
+ setclientid->se_callback_netid_len);
- READ_BUF(setclientid->se_callback_addr_len);
- SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
- READ_BUF(4);
- setclientid->se_callback_ident = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_addr_len) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, setclientid->se_callback_addr_len);
+ if (!p)
+ return nfserr_bad_xdr;
+ setclientid->se_callback_addr_val = svcxdr_tmpalloc(argp,
+ setclientid->se_callback_addr_len);
+ if (!setclientid->se_callback_addr_val)
+ return nfserr_jukebox;
+ memcpy(setclientid->se_callback_addr_val, p,
+ setclientid->se_callback_addr_len);
+ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_ident) < 0)
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
{
- DECODE_HEAD;
+ __be32 status;
if (argp->minorversion >= 1)
return nfserr_notsupp;
- READ_BUF(8 + NFS4_VERIFIER_SIZE);
- COPYMEM(&scd_c->sc_clientid, 8);
- COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
-
- DECODE_TAIL;
+ status = nfsd4_decode_clientid4(argp, &scd_c->sc_clientid);
+ if (status)
+ return status;
+ return nfsd4_decode_verifier4(argp, &scd_c->sc_confirm);
}
/* Also used for NVERIFY */
static __be32
nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
{
- DECODE_HEAD;
+ __be32 *p, status;
- if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
- goto out;
+ status = nfsd4_decode_bitmap4(argp, verify->ve_bmval,
+ ARRAY_SIZE(verify->ve_bmval));
+ if (status)
+ return status;
/* For convenience's sake, we compare raw xdr'd attributes in
* nfsd4_proc_verify */
- READ_BUF(4);
- verify->ve_attrlen = be32_to_cpup(p++);
- READ_BUF(verify->ve_attrlen);
- SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
+ if (xdr_stream_decode_u32(argp->xdr, &verify->ve_attrlen) < 0)
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, verify->ve_attrlen);
+ if (!p)
+ return nfserr_bad_xdr;
+ verify->ve_attrval = svcxdr_tmpalloc(argp, verify->ve_attrlen);
+ if (!verify->ve_attrval)
+ return nfserr_jukebox;
+ memcpy(verify->ve_attrval, p, verify->ve_attrlen);
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
{
- DECODE_HEAD;
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+ status = nfsd4_decode_stateid4(argp, &write->wr_stateid);
if (status)
return status;
- READ_BUF(16);
- p = xdr_decode_hyper(p, &write->wr_offset);
- write->wr_stable_how = be32_to_cpup(p++);
+ if (xdr_stream_decode_u64(argp->xdr, &write->wr_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &write->wr_stable_how) < 0)
+ return nfserr_bad_xdr;
if (write->wr_stable_how > NFS_FILE_SYNC)
- goto xdr_error;
- write->wr_buflen = be32_to_cpup(p++);
-
- status = svcxdr_construct_vector(argp, &write->wr_head,
- &write->wr_pagelist, write->wr_buflen);
- if (status)
- return status;
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &write->wr_buflen) < 0)
+ return nfserr_bad_xdr;
+ if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen))
+ return nfserr_bad_xdr;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
{
- DECODE_HEAD;
+ __be32 status;
if (argp->minorversion >= 1)
return nfserr_notsupp;
- READ_BUF(12);
- COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
- rlockowner->rl_owner.len = be32_to_cpup(p++);
- READ_BUF(rlockowner->rl_owner.len);
- READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
+ status = nfsd4_decode_state_owner4(argp, &rlockowner->rl_clientid,
+ &rlockowner->rl_owner);
+ if (status)
+ return status;
if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
return nfserr_inval;
- DECODE_TAIL;
+
+ return nfs_ok;
+}
+
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+ if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+}
+
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+{
+ u32 use_conn_in_rdma_mode;
+ __be32 status;
+
+ status = nfsd4_decode_sessionid4(argp, &bcts->sessionid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &bcts->dir) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &use_conn_in_rdma_mode) < 0)
+ return nfserr_bad_xdr;
+
+ return nfs_ok;
}
static __be32
-nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
- struct nfsd4_exchange_id *exid)
+nfsd4_decode_state_protect_ops(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
{
- int dummy, tmp;
- DECODE_HEAD;
+ __be32 status;
- READ_BUF(NFS4_VERIFIER_SIZE);
- COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+ status = nfsd4_decode_bitmap4(argp, exid->spo_must_enforce,
+ ARRAY_SIZE(exid->spo_must_enforce));
+ if (status)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_bitmap4(argp, exid->spo_must_allow,
+ ARRAY_SIZE(exid->spo_must_allow));
+ if (status)
+ return nfserr_bad_xdr;
- status = nfsd4_decode_opaque(argp, &exid->clname);
+ return nfs_ok;
+}
+
+/*
+ * This implementation currently does not support SP4_SSV.
+ * This decoder simply skips over these arguments.
+ */
+static noinline __be32
+nfsd4_decode_ssv_sp_parms(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
+{
+ u32 count, window, num_gss_handles;
+ __be32 status;
+
+ /* ssp_ops */
+ status = nfsd4_decode_state_protect_ops(argp, exid);
if (status)
+ return status;
+
+ /* ssp_hash_algs<> */
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+ while (count--) {
+ status = nfsd4_decode_ignored_string(argp, 0);
+ if (status)
+ return status;
+ }
+
+ /* ssp_encr_algs<> */
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+ while (count--) {
+ status = nfsd4_decode_ignored_string(argp, 0);
+ if (status)
+ return status;
+ }
+
+ if (xdr_stream_decode_u32(argp->xdr, &window) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &num_gss_handles) < 0)
return nfserr_bad_xdr;
- READ_BUF(4);
- exid->flags = be32_to_cpup(p++);
+ return nfs_ok;
+}
- /* Ignore state_protect4_a */
- READ_BUF(4);
- exid->spa_how = be32_to_cpup(p++);
+static __be32
+nfsd4_decode_state_protect4_a(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
+{
+ __be32 status;
+
+ if (xdr_stream_decode_u32(argp->xdr, &exid->spa_how) < 0)
+ return nfserr_bad_xdr;
switch (exid->spa_how) {
case SP4_NONE:
break;
case SP4_MACH_CRED:
- /* spo_must_enforce */
- status = nfsd4_decode_bitmap(argp,
- exid->spo_must_enforce);
+ status = nfsd4_decode_state_protect_ops(argp, exid);
if (status)
- goto out;
- /* spo_must_allow */
- status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
- if (status)
- goto out;
+ return status;
break;
case SP4_SSV:
- /* ssp_ops */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
-
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
-
- /* ssp_hash_algs<> */
- READ_BUF(4);
- tmp = be32_to_cpup(p++);
- while (tmp--) {
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
- }
-
- /* ssp_encr_algs<> */
- READ_BUF(4);
- tmp = be32_to_cpup(p++);
- while (tmp--) {
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
- }
-
- /* ignore ssp_window and ssp_num_gss_handles: */
- READ_BUF(8);
+ status = nfsd4_decode_ssv_sp_parms(argp, exid);
+ if (status)
+ return status;
break;
default:
- goto xdr_error;
+ return nfserr_bad_xdr;
}
- READ_BUF(4); /* nfs_impl_id4 array length */
- dummy = be32_to_cpup(p++);
+ return nfs_ok;
+}
- if (dummy > 1)
- goto xdr_error;
+static __be32
+nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
+{
+ __be32 status;
+ u32 count;
- if (dummy == 1) {
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
+ switch (count) {
+ case 0:
+ break;
+ case 1:
+ /* Note that RFC 8881 places no length limit on
+ * nii_domain, but this implementation permits no
+ * more than NFS4_OPAQUE_LIMIT bytes */
status = nfsd4_decode_opaque(argp, &exid->nii_domain);
if (status)
- goto xdr_error;
-
- /* nii_name */
+ return status;
+ /* Note that RFC 8881 places no length limit on
+ * nii_name, but this implementation permits no
+ * more than NFS4_OPAQUE_LIMIT bytes */
status = nfsd4_decode_opaque(argp, &exid->nii_name);
if (status)
- goto xdr_error;
-
- /* nii_date */
- status = nfsd4_decode_time(argp, &exid->nii_time);
+ return status;
+ status = nfsd4_decode_nfstime4(argp, &exid->nii_time);
if (status)
- goto xdr_error;
+ return status;
+ break;
+ default:
+ return nfserr_bad_xdr;
}
- DECODE_TAIL;
-}
-static __be32
-nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_create_session *sess)
-{
- DECODE_HEAD;
-
- READ_BUF(16);
- COPYMEM(&sess->clientid, 8);
- sess->seqid = be32_to_cpup(p++);
- sess->flags = be32_to_cpup(p++);
-
- /* Fore channel attrs */
- READ_BUF(28);
- p++; /* headerpadsz is always 0 */
- sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
- sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
- sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
- sess->fore_channel.maxops = be32_to_cpup(p++);
- sess->fore_channel.maxreqs = be32_to_cpup(p++);
- sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
- if (sess->fore_channel.nr_rdma_attrs == 1) {
- READ_BUF(4);
- sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
- } else if (sess->fore_channel.nr_rdma_attrs > 1) {
- dprintk("Too many fore channel attr bitmaps!\n");
- goto xdr_error;
- }
-
- /* Back channel attrs */
- READ_BUF(28);
- p++; /* headerpadsz is always 0 */
- sess->back_channel.maxreq_sz = be32_to_cpup(p++);
- sess->back_channel.maxresp_sz = be32_to_cpup(p++);
- sess->back_channel.maxresp_cached = be32_to_cpup(p++);
- sess->back_channel.maxops = be32_to_cpup(p++);
- sess->back_channel.maxreqs = be32_to_cpup(p++);
- sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
- if (sess->back_channel.nr_rdma_attrs == 1) {
- READ_BUF(4);
- sess->back_channel.rdma_attrs = be32_to_cpup(p++);
- } else if (sess->back_channel.nr_rdma_attrs > 1) {
- dprintk("Too many back channel attr bitmaps!\n");
- goto xdr_error;
- }
-
- READ_BUF(4);
- sess->callback_prog = be32_to_cpup(p++);
- nfsd4_decode_cb_sec(argp, &sess->cb_sec);
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
-nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_destroy_session *destroy_session)
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+ struct nfsd4_exchange_id *exid)
{
- DECODE_HEAD;
- READ_BUF(NFS4_MAX_SESSIONID_LEN);
- COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ __be32 status;
- DECODE_TAIL;
+ status = nfsd4_decode_verifier4(argp, &exid->verifier);
+ if (status)
+ return status;
+ status = nfsd4_decode_opaque(argp, &exid->clname);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &exid->flags) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_state_protect4_a(argp, exid);
+ if (status)
+ return status;
+ return nfsd4_decode_nfs_impl_id4(argp, exid);
}
static __be32
-nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
- struct nfsd4_free_stateid *free_stateid)
+nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp,
+ struct nfsd4_channel_attrs *ca)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
- COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
- DECODE_TAIL;
-}
+ __be32 *p;
-static __be32
-nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
- struct nfsd4_sequence *seq)
-{
- DECODE_HEAD;
+ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 7);
+ if (!p)
+ return nfserr_bad_xdr;
- READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
- COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- seq->seqid = be32_to_cpup(p++);
- seq->slotid = be32_to_cpup(p++);
- seq->maxslots = be32_to_cpup(p++);
- seq->cachethis = be32_to_cpup(p++);
+ /* headerpadsz is ignored */
+ p++;
+ ca->maxreq_sz = be32_to_cpup(p++);
+ ca->maxresp_sz = be32_to_cpup(p++);
+ ca->maxresp_cached = be32_to_cpup(p++);
+ ca->maxops = be32_to_cpup(p++);
+ ca->maxreqs = be32_to_cpup(p++);
+ ca->nr_rdma_attrs = be32_to_cpup(p);
+ switch (ca->nr_rdma_attrs) {
+ case 0:
+ break;
+ case 1:
+ if (xdr_stream_decode_u32(argp->xdr, &ca->rdma_attrs) < 0)
+ return nfserr_bad_xdr;
+ break;
+ default:
+ return nfserr_bad_xdr;
+ }
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
-nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+ struct nfsd4_create_session *sess)
{
- int i;
- __be32 *p, status;
- struct nfsd4_test_stateid_id *stateid;
-
- READ_BUF(4);
- test_stateid->ts_num_ids = ntohl(*p++);
-
- INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
-
- for (i = 0; i < test_stateid->ts_num_ids; i++) {
- stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
- if (!stateid) {
- status = nfserrno(-ENOMEM);
- goto out;
- }
-
- INIT_LIST_HEAD(&stateid->ts_id_list);
- list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid);
- if (status)
- goto out;
- }
+ status = nfsd4_decode_clientid4(argp, &sess->clientid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &sess->seqid) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &sess->flags) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_channel_attrs4(argp, &sess->fore_channel);
+ if (status)
+ return status;
+ status = nfsd4_decode_channel_attrs4(argp, &sess->back_channel);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_cb_sec(argp, &sess->cb_sec);
+ if (status)
+ return status;
- status = 0;
-out:
- return status;
-xdr_error:
- dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__);
- status = nfserr_bad_xdr;
- goto out;
+ return nfs_ok;
}
-static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+ struct nfsd4_destroy_session *destroy_session)
{
- DECODE_HEAD;
-
- READ_BUF(8);
- COPYMEM(&dc->clientid, 8);
-
- DECODE_TAIL;
+ return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid);
}
-static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
+static __be32
+nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
+ struct nfsd4_free_stateid *free_stateid)
{
- DECODE_HEAD;
-
- READ_BUF(4);
- rc->rca_one_fs = be32_to_cpup(p++);
-
- DECODE_TAIL;
+ return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
}
#ifdef CONFIG_NFSD_PNFS
@@ -1593,244 +1670,264 @@ static __be32
nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
struct nfsd4_getdeviceinfo *gdev)
{
- DECODE_HEAD;
- u32 num, i;
-
- READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
- COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
- gdev->gd_layout_type = be32_to_cpup(p++);
- gdev->gd_maxcount = be32_to_cpup(p++);
- num = be32_to_cpup(p++);
- if (num) {
- if (num > 1000)
- goto xdr_error;
- READ_BUF(4 * num);
- gdev->gd_notify_types = be32_to_cpup(p++);
- for (i = 1; i < num; i++) {
- if (be32_to_cpup(p++)) {
- status = nfserr_inval;
- goto out;
- }
- }
- }
- DECODE_TAIL;
-}
-
-static __be32
-nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutget *lgp)
-{
- DECODE_HEAD;
-
- READ_BUF(36);
- lgp->lg_signal = be32_to_cpup(p++);
- lgp->lg_layout_type = be32_to_cpup(p++);
- lgp->lg_seg.iomode = be32_to_cpup(p++);
- p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
- p = xdr_decode_hyper(p, &lgp->lg_seg.length);
- p = xdr_decode_hyper(p, &lgp->lg_minlength);
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+ status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid);
if (status)
return status;
+ if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_maxcount) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_uint32_array(argp->xdr,
+ &gdev->gd_notify_types, 1) < 0)
+ return nfserr_bad_xdr;
- READ_BUF(4);
- lgp->lg_maxcount = be32_to_cpup(p++);
-
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutcommit *lcp)
+ struct nfsd4_layoutcommit *lcp)
{
- DECODE_HEAD;
- u32 timechange;
-
- READ_BUF(20);
- p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
- p = xdr_decode_hyper(p, &lcp->lc_seg.length);
- lcp->lc_reclaim = be32_to_cpup(p++);
+ __be32 *p, status;
- status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_reclaim) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &lcp->lc_sid);
if (status)
return status;
-
- READ_BUF(4);
- lcp->lc_newoffset = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0)
+ return nfserr_bad_xdr;
if (lcp->lc_newoffset) {
- READ_BUF(8);
- p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0)
+ return nfserr_bad_xdr;
} else
lcp->lc_last_wr = 0;
- READ_BUF(4);
- timechange = be32_to_cpup(p++);
- if (timechange) {
- status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+ p = xdr_inline_decode(argp->xdr, XDR_UNIT);
+ if (!p)
+ return nfserr_bad_xdr;
+ if (xdr_item_is_present(p)) {
+ status = nfsd4_decode_nfstime4(argp, &lcp->lc_mtime);
if (status)
return status;
} else {
lcp->lc_mtime.tv_nsec = UTIME_NOW;
}
- READ_BUF(8);
- lcp->lc_layout_type = be32_to_cpup(p++);
+ return nfsd4_decode_layoutupdate4(argp, lcp);
+}
- /*
- * Save the layout update in XDR format and let the layout driver deal
- * with it later.
- */
- lcp->lc_up_len = be32_to_cpup(p++);
- if (lcp->lc_up_len > 0) {
- READ_BUF(lcp->lc_up_len);
- READMEM(lcp->lc_up_layout, lcp->lc_up_len);
- }
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutget *lgp)
+{
+ __be32 status;
- DECODE_TAIL;
+ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_seg.iomode) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.length) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_minlength) < 0)
+ return nfserr_bad_xdr;
+ status = nfsd4_decode_stateid4(argp, &lgp->lg_sid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_maxcount) < 0)
+ return nfserr_bad_xdr;
+
+ return nfs_ok;
}
static __be32
nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
struct nfsd4_layoutreturn *lrp)
{
- DECODE_HEAD;
+ if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_seg.iomode) < 0)
+ return nfserr_bad_xdr;
+ return nfsd4_decode_layoutreturn4(argp, lrp);
+}
+#endif /* CONFIG_NFSD_PNFS */
- READ_BUF(16);
- lrp->lr_reclaim = be32_to_cpup(p++);
- lrp->lr_layout_type = be32_to_cpup(p++);
- lrp->lr_seg.iomode = be32_to_cpup(p++);
- lrp->lr_return_type = be32_to_cpup(p++);
- if (lrp->lr_return_type == RETURN_FILE) {
- READ_BUF(16);
- p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
- p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+ struct nfsd4_secinfo_no_name *sin)
+{
+ if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
+ return nfserr_bad_xdr;
+ return nfs_ok;
+}
- status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
- if (status)
- return status;
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+ struct nfsd4_sequence *seq)
+{
+ __be32 *p, status;
- READ_BUF(4);
- lrp->lrf_body_len = be32_to_cpup(p++);
- if (lrp->lrf_body_len > 0) {
- READ_BUF(lrp->lrf_body_len);
- READMEM(lrp->lrf_body, lrp->lrf_body_len);
- }
- } else {
- lrp->lr_seg.offset = 0;
- lrp->lr_seg.length = NFS4_MAX_UINT64;
- }
+ status = nfsd4_decode_sessionid4(argp, &seq->sessionid);
+ if (status)
+ return status;
+ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 4);
+ if (!p)
+ return nfserr_bad_xdr;
+ seq->seqid = be32_to_cpup(p++);
+ seq->slotid = be32_to_cpup(p++);
+ seq->maxslots = be32_to_cpup(p++);
+ seq->cachethis = be32_to_cpup(p);
- DECODE_TAIL;
+ return nfs_ok;
}
-#endif /* CONFIG_NFSD_PNFS */
static __be32
-nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
- struct nfsd4_fallocate *fallocate)
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
{
- DECODE_HEAD;
+ struct nfsd4_test_stateid_id *stateid;
+ __be32 status;
+ u32 i;
- status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid);
- if (status)
- return status;
+ if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0)
+ return nfserr_bad_xdr;
+
+ INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
+ for (i = 0; i < test_stateid->ts_num_ids; i++) {
+ stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
+ if (!stateid)
+ return nfserrno(-ENOMEM); /* XXX: not jukebox? */
+ INIT_LIST_HEAD(&stateid->ts_id_list);
+ list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+ status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid);
+ if (status)
+ return status;
+ }
+
+ return nfs_ok;
+}
- READ_BUF(16);
- p = xdr_decode_hyper(p, &fallocate->falloc_offset);
- xdr_decode_hyper(p, &fallocate->falloc_length);
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp,
+ struct nfsd4_destroy_clientid *dc)
+{
+ return nfsd4_decode_clientid4(argp, &dc->clientid);
+}
- DECODE_TAIL;
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
+ struct nfsd4_reclaim_complete *rc)
+{
+ if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0)
+ return nfserr_bad_xdr;
+ return nfs_ok;
}
static __be32
-nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+ struct nfsd4_fallocate *fallocate)
{
- DECODE_HEAD;
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
- if (status)
- return status;
- status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+ status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid);
if (status)
return status;
+ if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_length) < 0)
+ return nfserr_bad_xdr;
- READ_BUF(8 + 8 + 8);
- p = xdr_decode_hyper(p, &clone->cl_src_pos);
- p = xdr_decode_hyper(p, &clone->cl_dst_pos);
- p = xdr_decode_hyper(p, &clone->cl_count);
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
struct nl4_server *ns)
{
- DECODE_HEAD;
struct nfs42_netaddr *naddr;
+ __be32 *p;
- READ_BUF(4);
- ns->nl4_type = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &ns->nl4_type) < 0)
+ return nfserr_bad_xdr;
/* currently support for 1 inter-server source server */
switch (ns->nl4_type) {
case NL4_NETADDR:
naddr = &ns->u.nl4_addr;
- READ_BUF(4);
- naddr->netid_len = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &naddr->netid_len) < 0)
+ return nfserr_bad_xdr;
if (naddr->netid_len > RPCBIND_MAXNETIDLEN)
- goto xdr_error;
+ return nfserr_bad_xdr;
- READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */
- COPYMEM(naddr->netid, naddr->netid_len);
+ p = xdr_inline_decode(argp->xdr, naddr->netid_len);
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(naddr->netid, p, naddr->netid_len);
- naddr->addr_len = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &naddr->addr_len) < 0)
+ return nfserr_bad_xdr;
if (naddr->addr_len > RPCBIND_MAXUADDRLEN)
- goto xdr_error;
+ return nfserr_bad_xdr;
- READ_BUF(naddr->addr_len);
- COPYMEM(naddr->addr, naddr->addr_len);
+ p = xdr_inline_decode(argp->xdr, naddr->addr_len);
+ if (!p)
+ return nfserr_bad_xdr;
+ memcpy(naddr->addr, p, naddr->addr_len);
break;
default:
- goto xdr_error;
+ return nfserr_bad_xdr;
}
- DECODE_TAIL;
+
+ return nfs_ok;
}
static __be32
nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
{
- DECODE_HEAD;
struct nl4_server *ns_dummy;
- int i, count;
+ u32 consecutive, i, count;
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+ status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
if (status)
return status;
- status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+ status = nfsd4_decode_stateid4(argp, &copy->cp_dst_stateid);
if (status)
return status;
+ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_src_pos) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_dst_pos) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_count) < 0)
+ return nfserr_bad_xdr;
+ /* ca_consecutive: we always do consecutive copies */
+ if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &copy->cp_synchronous) < 0)
+ return nfserr_bad_xdr;
- READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
- p = xdr_decode_hyper(p, &copy->cp_src_pos);
- p = xdr_decode_hyper(p, &copy->cp_dst_pos);
- p = xdr_decode_hyper(p, &copy->cp_count);
- p++; /* ca_consecutive: we always do consecutive copies */
- copy->cp_synchronous = be32_to_cpup(p++);
-
- count = be32_to_cpup(p++);
-
+ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
copy->cp_intra = false;
if (count == 0) { /* intra-server copy */
copy->cp_intra = true;
- goto intra;
+ return nfs_ok;
}
- /* decode all the supplied server addresses but use first */
+ /* decode all the supplied server addresses but use only the first */
status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
if (status)
return status;
ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
if (ns_dummy == NULL)
- return nfserrno(-ENOMEM);
+ return nfserrno(-ENOMEM); /* XXX: jukebox? */
for (i = 0; i < count - 1; i++) {
status = nfsd4_decode_nl4_server(argp, ns_dummy);
if (status) {
@@ -1839,44 +1936,64 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
}
}
kfree(ns_dummy);
-intra:
- DECODE_TAIL;
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
+ struct nfsd4_copy_notify *cn)
+{
+ __be32 status;
+
+ status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid);
+ if (status)
+ return status;
+ return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
}
static __be32
nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
struct nfsd4_offload_status *os)
{
- return nfsd4_decode_stateid(argp, &os->stateid);
+ return nfsd4_decode_stateid4(argp, &os->stateid);
}
static __be32
-nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
- struct nfsd4_copy_notify *cn)
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
__be32 status;
- status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid);
+ status = nfsd4_decode_stateid4(argp, &seek->seek_stateid);
if (status)
return status;
- return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
+ if (xdr_stream_decode_u64(argp->xdr, &seek->seek_offset) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
+ return nfserr_bad_xdr;
+
+ return nfs_ok;
}
static __be32
-nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
{
- DECODE_HEAD;
+ __be32 status;
- status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+ status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid);
if (status)
return status;
+ status = nfsd4_decode_stateid4(argp, &clone->cl_dst_stateid);
+ if (status)
+ return status;
+ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_src_pos) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_dst_pos) < 0)
+ return nfserr_bad_xdr;
+ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_count) < 0)
+ return nfserr_bad_xdr;
- READ_BUF(8 + 4);
- p = xdr_decode_hyper(p, &seek->seek_offset);
- seek->seek_whence = be32_to_cpup(p);
-
- DECODE_TAIL;
+ return nfs_ok;
}
/*
@@ -1889,13 +2006,14 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
*/
/*
- * Decode data into buffer. Uses head and pages constructed by
- * svcxdr_construct_vector.
+ * Decode data into buffer.
*/
static __be32
-nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
- struct page **pages, char **bufp, u32 buflen)
+nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr,
+ char **bufp, u32 buflen)
{
+ struct page **pages = xdr->pages;
+ struct kvec *head = xdr->head;
char *tmp, *dp;
u32 len;
@@ -1938,25 +2056,22 @@ nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
static __be32
nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
{
- DECODE_HEAD;
char *name, *sp, *dp;
u32 namelen, cnt;
+ __be32 *p;
- READ_BUF(4);
- namelen = be32_to_cpup(p++);
-
+ if (xdr_stream_decode_u32(argp->xdr, &namelen) < 0)
+ return nfserr_bad_xdr;
if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN))
return nfserr_nametoolong;
-
if (namelen == 0)
- goto xdr_error;
-
- READ_BUF(namelen);
-
+ return nfserr_bad_xdr;
+ p = xdr_inline_decode(argp->xdr, namelen);
+ if (!p)
+ return nfserr_bad_xdr;
name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1);
if (!name)
return nfserr_jukebox;
-
memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
/*
@@ -1969,14 +2084,14 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
while (cnt-- > 0) {
if (*sp == '\0')
- goto xdr_error;
+ return nfserr_bad_xdr;
*dp++ = *sp++;
}
*dp = '\0';
*namep = name;
- DECODE_TAIL;
+ return nfs_ok;
}
/*
@@ -2008,13 +2123,11 @@ static __be32
nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
struct nfsd4_setxattr *setxattr)
{
- DECODE_HEAD;
u32 flags, maxcount, size;
- struct kvec head;
- struct page **pagelist;
+ __be32 status;
- READ_BUF(4);
- flags = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
+ return nfserr_bad_xdr;
if (flags > SETXATTR4_REPLACE)
return nfserr_inval;
@@ -2027,33 +2140,32 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
maxcount = svc_max_payload(argp->rqstp);
maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
- READ_BUF(4);
- size = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &size) < 0)
+ return nfserr_bad_xdr;
if (size > maxcount)
return nfserr_xattr2big;
setxattr->setxa_len = size;
if (size > 0) {
- status = svcxdr_construct_vector(argp, &head, &pagelist, size);
- if (status)
- return status;
+ struct xdr_buf payload;
- status = nfsd4_vbuf_from_vector(argp, &head, pagelist,
- &setxattr->setxa_buf, size);
+ if (!xdr_stream_subsegment(argp->xdr, &payload, size))
+ return nfserr_bad_xdr;
+ status = nfsd4_vbuf_from_vector(argp, &payload,
+ &setxattr->setxa_buf, size);
}
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
struct nfsd4_listxattrs *listxattrs)
{
- DECODE_HEAD;
u32 maxcount;
- READ_BUF(12);
- p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie);
+ if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
+ return nfserr_bad_xdr;
/*
* If the cookie is too large to have even one user.x attribute
@@ -2063,7 +2175,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
(XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2)))
return nfserr_badcookie;
- maxcount = be32_to_cpup(p++);
+ if (xdr_stream_decode_u32(argp->xdr, &maxcount) < 0)
+ return nfserr_bad_xdr;
if (maxcount < 8)
/* Always need at least 2 words (length and one character) */
return nfserr_inval;
@@ -2071,7 +2184,7 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
maxcount = min(maxcount, svc_max_payload(argp->rqstp));
listxattrs->lsxa_maxcount = maxcount;
- DECODE_TAIL;
+ return nfs_ok;
}
static __be32
@@ -2198,43 +2311,54 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
return true;
}
-static __be32
+static int
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
- DECODE_HEAD;
struct nfsd4_op *op;
bool cachethis = false;
int auth_slack= argp->rqstp->rq_auth_slack;
int max_reply = auth_slack + 8; /* opcnt, status */
int readcount = 0;
int readbytes = 0;
+ __be32 *p;
int i;
- READ_BUF(4);
- argp->taglen = be32_to_cpup(p++);
- READ_BUF(argp->taglen);
- SAVEMEM(argp->tag, argp->taglen);
- READ_BUF(8);
- argp->minorversion = be32_to_cpup(p++);
- argp->opcnt = be32_to_cpup(p++);
- max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
-
- if (argp->taglen > NFSD4_MAX_TAGLEN)
- goto xdr_error;
+ if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0)
+ return 0;
+ max_reply += XDR_UNIT;
+ argp->tag = NULL;
+ if (unlikely(argp->taglen)) {
+ if (argp->taglen > NFSD4_MAX_TAGLEN)
+ return 0;
+ p = xdr_inline_decode(argp->xdr, argp->taglen);
+ if (!p)
+ return 0;
+ argp->tag = svcxdr_tmpalloc(argp, argp->taglen);
+ if (!argp->tag)
+ return 0;
+ memcpy(argp->tag, p, argp->taglen);
+ max_reply += xdr_align_size(argp->taglen);
+ }
+
+ if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+ return 0;
+
/*
* NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
* here, so we return success at the xdr level so that
* nfsd4_proc can handle this is an NFS-level error.
*/
if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
- return 0;
+ return 1;
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
if (!argp->ops) {
argp->ops = argp->iops;
dprintk("nfsd: couldn't allocate room for COMPOUND\n");
- goto xdr_error;
+ return 0;
}
}
@@ -2245,12 +2369,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op = &argp->ops[i];
op->replay = NULL;
- READ_BUF(4);
- op->opnum = be32_to_cpup(p++);
-
- if (nfsd4_opnum_in_range(argp, op))
+ if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0)
+ return 0;
+ if (nfsd4_opnum_in_range(argp, op)) {
op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
- else {
+ if (op->status != nfs_ok)
+ trace_nfsd_compound_decode_err(argp->rqstp,
+ argp->opcnt, i,
+ op->opnum,
+ op->status);
+ } else {
op->opnum = OP_ILLEGAL;
op->status = nfserr_op_illegal;
}
@@ -2289,7 +2417,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
- DECODE_TAIL;
+ return 1;
}
static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
@@ -2298,12 +2426,8 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
if (exp->ex_flags & NFSEXP_V4ROOT) {
*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
*p++ = 0;
- } else if (IS_I_VERSION(inode)) {
+ } else
p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
- } else {
- *p++ = cpu_to_be32(stat->ctime.tv_sec);
- *p++ = cpu_to_be32(stat->ctime.tv_nsec);
- }
return p;
}
@@ -2335,15 +2459,8 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
{
*p++ = cpu_to_be32(c->atomic);
- if (c->change_supported) {
- p = xdr_encode_hyper(p, c->before_change);
- p = xdr_encode_hyper(p, c->after_change);
- } else {
- *p++ = cpu_to_be32(c->before_ctime_sec);
- *p++ = cpu_to_be32(c->before_ctime_nsec);
- *p++ = cpu_to_be32(c->after_ctime_sec);
- *p++ = cpu_to_be32(c->after_ctime_nsec);
- }
+ p = xdr_encode_hyper(p, c->before_change);
+ p = xdr_encode_hyper(p, c->after_change);
return p;
}
@@ -2558,7 +2675,7 @@ static u32 nfs4_file_type(umode_t mode)
case S_IFREG: return NF4REG;
case S_IFSOCK: return NF4SOCK;
default: return NF4BAD;
- };
+ }
}
static inline __be32
@@ -3194,16 +3311,6 @@ out_acl:
goto out;
}
- if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- if (IS_I_VERSION(d_inode(dentry)))
- *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR);
- else
- *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA);
- }
-
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
status = nfsd4_encode_security_label(xdr, rqstp, context,
@@ -3756,8 +3863,8 @@ static __be32 nfsd4_encode_splice_read(
{
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
+ int status, space_left;
u32 eof;
- int space_left;
__be32 nfserr;
__be32 *p = xdr->p - 2;
@@ -3768,14 +3875,13 @@ static __be32 nfsd4_encode_splice_read(
nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
file, read->rd_offset, &maxcount, &eof);
read->rd_length = maxcount;
- if (nfserr) {
- /*
- * nfsd_splice_actor may have already messed with the
- * page length; reset it so as not to confuse
- * xdr_truncate_encode:
- */
- buf->page_len = 0;
- return nfserr;
+ if (nfserr)
+ goto out_err;
+ status = svc_encode_result_payload(read->rd_rqstp,
+ buf->head[0].iov_len, maxcount);
+ if (status) {
+ nfserr = nfserrno(status);
+ goto out_err;
}
*(p++) = htonl(eof);
@@ -3806,6 +3912,15 @@ static __be32 nfsd4_encode_splice_read(
xdr->end = (__be32 *)((void *)xdr->end + space_left);
return 0;
+
+out_err:
+ /*
+ * nfsd_splice_actor may have already messed with the
+ * page length; reset it so as not to confuse
+ * xdr_truncate_encode in our caller.
+ */
+ buf->page_len = 0;
+ return nfserr;
}
static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
@@ -3829,7 +3944,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_length = maxcount;
if (nfserr)
return nfserr;
- if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
+ if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount))
return nfserr_io;
xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
@@ -3897,6 +4012,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
int zero = 0;
struct xdr_stream *xdr = &resp->xdr;
int length_offset = xdr->buf->len;
+ int status;
__be32 *p;
p = xdr_reserve_space(xdr, 4);
@@ -3917,9 +4033,13 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
(char *)p, &maxcount);
if (nfserr == nfserr_isdir)
nfserr = nfserr_inval;
- if (nfserr) {
- xdr_truncate_encode(xdr, length_offset);
- return nfserr;
+ if (nfserr)
+ goto out_err;
+ status = svc_encode_result_payload(readlink->rl_rqstp, length_offset,
+ maxcount);
+ if (status) {
+ nfserr = nfserrno(status);
+ goto out_err;
}
wire_count = htonl(maxcount);
@@ -3929,6 +4049,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
&zero, 4 - (maxcount&3));
return 0;
+
+out_err:
+ xdr_truncate_encode(xdr, length_offset);
+ return nfserr;
}
static __be32
@@ -4575,7 +4699,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
__be32 *p;
nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
- copy->cp_synchronous);
+ !!copy->cp_synchronous);
if (nfserr)
return nfserr;
@@ -5182,10 +5306,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
if (op->status && opdesc &&
!(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE))
goto status;
- BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+ BUG_ON(op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
!nfsd4_enc_ops[op->opnum]);
encoder = nfsd4_enc_ops[op->opnum];
op->status = encoder(resp, op->status, &op->u);
+ if (op->status)
+ trace_nfsd_compound_encode_err(rqstp, op->opnum, op->status);
if (opdesc && opdesc->op_release)
opdesc->op_release(&op->u);
xdr_commit_encode(xdr);
@@ -5254,12 +5380,6 @@ nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
}
-int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_ressize_check(rqstp, p);
-}
-
void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
{
struct nfsd4_compoundargs *args = rqstp->rq_argp;
@@ -5268,8 +5388,6 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
kfree(args->ops);
args->ops = args->iops;
}
- kfree(args->tmpp);
- args->tmpp = NULL;
while (args->to_free) {
struct svcxdr_tmpbuf *tb = args->to_free;
args->to_free = tb->next;
@@ -5278,33 +5396,18 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
}
int
-nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
-{
- return 1;
-}
-
-int
nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd4_compoundargs *args = rqstp->rq_argp;
- if (rqstp->rq_arg.head[0].iov_len % 4) {
- /* client is nuts */
- dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
- __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
- return 0;
- }
- args->p = p;
- args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
- args->pagelist = rqstp->rq_arg.pages;
- args->pagelen = rqstp->rq_arg.page_len;
- args->tail = false;
- args->tmpp = NULL;
+ /* svcxdr_tmp_alloc */
args->to_free = NULL;
+
+ args->xdr = &rqstp->rq_arg_stream;
args->ops = args->iops;
args->rqstp = rqstp;
- return !nfsd4_decode_compound(args);
+ return nfsd4_decode_compound(args);
}
int
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index cb742e17e04a..d63cf8196fed 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -74,6 +74,14 @@ extern unsigned long nfsd_drc_mem_used;
extern const struct seq_operations nfs_exports_op;
/*
+ * Common void argument and result helpers
+ */
+struct nfsd_voidargs { };
+struct nfsd_voidres { };
+int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p);
+int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p);
+
+/*
* Function prototypes.
*/
int nfsd_svc(int nrservs, struct net *net, const struct cred *cred);
@@ -387,7 +395,6 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
- FATTR4_WORD2_CHANGE_ATTR_TYPE | \
FATTR4_WORD2_MODE_UMASK | \
NFSD4_2_SECURITY_ATTRS | \
FATTR4_WORD2_XATTR_SUPPORT)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c81dbbad8792..66f2ef67792a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -268,12 +268,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (fileid_type == FILEID_ROOT)
dentry = dget(exp->ex_path.dentry);
else {
- dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
- data_left, fileid_type,
- nfsd_acceptable, exp);
- if (IS_ERR_OR_NULL(dentry))
+ dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid,
+ data_left, fileid_type,
+ nfsd_acceptable, exp);
+ if (IS_ERR_OR_NULL(dentry)) {
trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
dentry ? PTR_ERR(dentry) : -ESTALE);
+ switch (PTR_ERR(dentry)) {
+ case -ENOMEM:
+ case -ETIMEDOUT:
+ break;
+ default:
+ dentry = ERR_PTR(-ESTALE);
+ }
+ }
}
if (dentry == NULL)
goto out;
@@ -291,6 +299,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
fhp->fh_dentry = dentry;
fhp->fh_export = exp;
+
+ switch (rqstp->rq_vers) {
+ case 4:
+ if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR)
+ fhp->fh_no_atomic_attr = true;
+ break;
+ case 3:
+ if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC)
+ fhp->fh_no_wcc = true;
+ break;
+ case 2:
+ fhp->fh_no_wcc = true;
+ }
+
return 0;
out:
exp_put(exp);
@@ -559,6 +581,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
*/
set_version_and_fsid_type(fhp, exp, ref_fh);
+ /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */
+ fhp->fh_no_wcc = ref_fh ? ref_fh->fh_no_wcc : false;
+
if (ref_fh == fhp)
fh_put(ref_fh);
@@ -662,6 +687,7 @@ fh_put(struct svc_fh *fhp)
exp_put(exp);
fhp->fh_export = NULL;
}
+ fhp->fh_no_wcc = false;
return;
}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 56cfbc361561..cb20c2cd3469 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -35,6 +35,12 @@ typedef struct svc_fh {
bool fh_locked; /* inode locked by us */
bool fh_want_write; /* remount protection taken */
+ bool fh_no_wcc; /* no wcc data needed */
+ bool fh_no_atomic_attr;
+ /*
+ * wcc data is not atomic with
+ * operation
+ */
int fh_flags; /* FH flags */
#ifdef CONFIG_NFSD_V3
bool fh_post_saved; /* post-op attrs saved */
@@ -54,7 +60,6 @@ typedef struct svc_fh {
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
#endif /* CONFIG_NFSD_V3 */
-
} svc_fh;
#define NFSD4_FH_FOREIGN (1<<0)
#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
@@ -259,13 +264,16 @@ fh_clear_wcc(struct svc_fh *fhp)
static inline u64 nfsd4_change_attribute(struct kstat *stat,
struct inode *inode)
{
- u64 chattr;
-
- chattr = stat->ctime.tv_sec;
- chattr <<= 30;
- chattr += stat->ctime.tv_nsec;
- chattr += inode_query_iversion(inode);
- return chattr;
+ if (IS_I_VERSION(inode)) {
+ u64 chattr;
+
+ chattr = stat->ctime.tv_sec;
+ chattr <<= 30;
+ chattr += stat->ctime.tv_nsec;
+ chattr += inode_query_iversion(inode);
+ return chattr;
+ } else
+ return time_to_chattr(&stat->ctime);
}
extern void fill_pre_wcc(struct svc_fh *fhp);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0d71549f9d42..9473d048efec 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -609,7 +609,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
* NFSv2 Server procedures.
* Only the results of non-idempotent operations are cached.
*/
-struct nfsd_void { int dummy; };
#define ST 1 /* status */
#define FH 8 /* filehandle */
@@ -618,10 +617,10 @@ struct nfsd_void { int dummy; };
static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_NULL] = {
.pc_func = nfsd_proc_null,
- .pc_decode = nfssvc_decode_void,
- .pc_encode = nfssvc_encode_void,
- .pc_argsize = sizeof(struct nfsd_void),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
},
@@ -647,10 +646,10 @@ static const struct svc_procedure nfsd_procedures2[18] = {
},
[NFSPROC_ROOT] = {
.pc_func = nfsd_proc_root,
- .pc_decode = nfssvc_decode_void,
- .pc_encode = nfssvc_encode_void,
- .pc_argsize = sizeof(struct nfsd_void),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
},
@@ -685,10 +684,10 @@ static const struct svc_procedure nfsd_procedures2[18] = {
},
[NFSPROC_WRITECACHE] = {
.pc_func = nfsd_proc_writecache,
- .pc_decode = nfssvc_decode_void,
- .pc_encode = nfssvc_encode_void,
- .pc_argsize = sizeof(struct nfsd_void),
- .pc_ressize = sizeof(struct nfsd_void),
+ .pc_decode = nfssvc_decode_voidarg,
+ .pc_encode = nfssvc_encode_voidres,
+ .pc_argsize = sizeof(struct nfsd_voidargs),
+ .pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
},
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 27b1ad136150..00384c332f9b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -29,6 +29,8 @@
#include "netns.h"
#include "filecache.h"
+#include "trace.h"
+
#define NFSDDBG_FACILITY NFSDDBG_SVC
bool inter_copy_offload_enable;
@@ -527,8 +529,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
return;
nfsd_shutdown_net(net);
- printk(KERN_WARNING "nfsd: last server has exited, flushing export "
- "cache\n");
+ pr_info("nfsd: last server has exited, flushing export cache\n");
nfsd_export_flush(net);
}
@@ -1009,17 +1010,16 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
struct kvec *resv = &rqstp->rq_res.head[0];
__be32 *p;
- dprintk("nfsd_dispatch: vers %d proc %d\n",
- rqstp->rq_vers, rqstp->rq_proc);
-
if (nfs_request_too_big(rqstp, proc))
- goto out_too_large;
+ goto out_decode_err;
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
+
+ svcxdr_init_decode(rqstp);
if (!proc->pc_decode(rqstp, argv->iov_base))
goto out_decode_err;
@@ -1050,29 +1050,51 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
out_cached_reply:
return 1;
-out_too_large:
- dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
- *statp = rpc_garbage_args;
- return 1;
-
out_decode_err:
- dprintk("nfsd: failed to decode arguments!\n");
+ trace_nfsd_garbage_args_err(rqstp);
*statp = rpc_garbage_args;
return 1;
out_update_drop:
- dprintk("nfsd: Dropping request; may be revisited later\n");
nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
out_dropit:
return 0;
out_encode_err:
- dprintk("nfsd: failed to encode result!\n");
+ trace_nfsd_cant_encode_err(rqstp);
nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
*statp = rpc_system_err;
return 1;
}
+/**
+ * nfssvc_decode_voidarg - Decode void arguments
+ * @rqstp: Server RPC transaction context
+ * @p: buffer containing arguments to decode
+ *
+ * Return values:
+ * %0: Arguments were not valid
+ * %1: Decoding was successful
+ */
+int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+{
+ return 1;
+}
+
+/**
+ * nfssvc_encode_voidres - Encode void results
+ * @rqstp: Server RPC transaction context
+ * @p: buffer in which to encode results
+ *
+ * Return values:
+ * %0: Local error while encoding
+ * %1: Encoding was successful
+ */
+int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+{
+ return xdr_ressize_check(rqstp, p);
+}
+
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
int ret;
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 8a288c8fcd57..7aa6e8aca2c1 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -192,11 +192,6 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f
/*
* XDR decode functions
*/
-int
-nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_argsize_check(rqstp, p);
-}
int
nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
@@ -423,11 +418,6 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
/*
* XDR encode functions
*/
-int
-nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
-{
- return xdr_ressize_check(rqstp, p);
-}
int
nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p)
@@ -469,6 +459,7 @@ int
nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_readlinkres *resp = rqstp->rq_resp;
+ struct kvec *head = rqstp->rq_res.head;
*p++ = resp->status;
if (resp->status != nfs_ok)
@@ -483,6 +474,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
}
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len))
+ return 0;
return 1;
}
@@ -490,6 +483,7 @@ int
nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd_readres *resp = rqstp->rq_resp;
+ struct kvec *head = rqstp->rq_res.head;
*p++ = resp->status;
if (resp->status != nfs_ok)
@@ -507,6 +501,8 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
}
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->count))
+ return 0;
return 1;
}
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 90967466a1e5..f008b95ceec2 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 99bf07800cd0..92a0973dd671 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -12,6 +12,100 @@
#include "export.h"
#include "nfsfh.h"
+#define NFSD_TRACE_PROC_ARG_FIELDS \
+ __field(unsigned int, netns_ino) \
+ __field(u32, xid) \
+ __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
+ __array(unsigned char, client, sizeof(struct sockaddr_in6))
+
+#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
+ do { \
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
+ __entry->xid = be32_to_cpu(rqstp->rq_xid); \
+ memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
+ rqstp->rq_xprt->xpt_locallen); \
+ memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
+ rqstp->rq_xprt->xpt_remotelen); \
+ } while (0);
+
+#define NFSD_TRACE_PROC_RES_FIELDS \
+ __field(unsigned int, netns_ino) \
+ __field(u32, xid) \
+ __field(unsigned long, status) \
+ __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
+ __array(unsigned char, client, sizeof(struct sockaddr_in6))
+
+#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \
+ do { \
+ __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
+ __entry->xid = be32_to_cpu(rqstp->rq_xid); \
+ __entry->status = be32_to_cpu(error); \
+ memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
+ rqstp->rq_xprt->xpt_locallen); \
+ memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
+ rqstp->rq_xprt->xpt_remotelen); \
+ } while (0);
+
+TRACE_EVENT(nfsd_garbage_args_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp
+ ),
+ TP_ARGS(rqstp),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_ARG_FIELDS
+
+ __field(u32, vers)
+ __field(u32, proc)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+
+ __entry->vers = rqstp->rq_vers;
+ __entry->proc = rqstp->rq_proc;
+ ),
+ TP_printk("xid=0x%08x vers=%u proc=%u",
+ __entry->xid, __entry->vers, __entry->proc
+ )
+);
+
+TRACE_EVENT(nfsd_cant_encode_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp
+ ),
+ TP_ARGS(rqstp),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_ARG_FIELDS
+
+ __field(u32, vers)
+ __field(u32, proc)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+
+ __entry->vers = rqstp->rq_vers;
+ __entry->proc = rqstp->rq_proc;
+ ),
+ TP_printk("xid=0x%08x vers=%u proc=%u",
+ __entry->xid, __entry->vers, __entry->proc
+ )
+);
+
+#define show_nfsd_may_flags(x) \
+ __print_flags(x, "|", \
+ { NFSD_MAY_EXEC, "EXEC" }, \
+ { NFSD_MAY_WRITE, "WRITE" }, \
+ { NFSD_MAY_READ, "READ" }, \
+ { NFSD_MAY_SATTR, "SATTR" }, \
+ { NFSD_MAY_TRUNC, "TRUNC" }, \
+ { NFSD_MAY_LOCK, "LOCK" }, \
+ { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \
+ { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \
+ { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \
+ { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \
+ { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \
+ { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \
+ { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" })
+
TRACE_EVENT(nfsd_compound,
TP_PROTO(const struct svc_rqst *rqst,
u32 args_opcnt),
@@ -51,6 +145,56 @@ TRACE_EVENT(nfsd_compound_status,
__get_str(name), __entry->status)
)
+TRACE_EVENT(nfsd_compound_decode_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ u32 args_opcnt,
+ u32 resp_opcnt,
+ u32 opnum,
+ __be32 status
+ ),
+ TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_RES_FIELDS
+
+ __field(u32, args_opcnt)
+ __field(u32, resp_opcnt)
+ __field(u32, opnum)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+
+ __entry->args_opcnt = args_opcnt;
+ __entry->resp_opcnt = resp_opcnt;
+ __entry->opnum = opnum;
+ ),
+ TP_printk("op=%u/%u opnum=%u status=%lu",
+ __entry->resp_opcnt, __entry->args_opcnt,
+ __entry->opnum, __entry->status)
+);
+
+TRACE_EVENT(nfsd_compound_encode_err,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ u32 opnum,
+ __be32 status
+ ),
+ TP_ARGS(rqstp, opnum, status),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_RES_FIELDS
+
+ __field(u32, opnum)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+
+ __entry->opnum = opnum;
+ ),
+ TP_printk("opnum=%u status=%lu",
+ __entry->opnum, __entry->status)
+);
+
+
DECLARE_EVENT_CLASS(nfsd_fh_err_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
@@ -421,6 +565,9 @@ TRACE_EVENT(nfsd_clid_inuse_err,
__entry->cl_boot, __entry->cl_id)
)
+/*
+ * from fs/nfsd/filecache.h
+ */
TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
@@ -435,13 +582,6 @@ TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
{ 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
{ 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
-/* FIXME: This should probably be fleshed out in the future. */
-#define show_nf_may(val) \
- __print_flags(val, "|", \
- { NFSD_MAY_READ, "READ" }, \
- { NFSD_MAY_WRITE, "WRITE" }, \
- { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" })
-
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
TP_ARGS(nf),
@@ -461,12 +601,12 @@ DECLARE_EVENT_CLASS(nfsd_file_class,
__entry->nf_may = nf->nf_may;
__entry->nf_file = nf->nf_file;
),
- TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+ TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p",
__entry->nf_hashval,
__entry->nf_inode,
__entry->nf_ref,
show_nf_flags(__entry->nf_flags),
- show_nf_may(__entry->nf_may),
+ show_nfsd_may_flags(__entry->nf_may),
__entry->nf_file)
)
@@ -492,10 +632,10 @@ TRACE_EVENT(nfsd_file_acquire,
__field(u32, xid)
__field(unsigned int, hash)
__field(void *, inode)
- __field(unsigned int, may_flags)
+ __field(unsigned long, may_flags)
__field(int, nf_ref)
__field(unsigned long, nf_flags)
- __field(unsigned char, nf_may)
+ __field(unsigned long, nf_may)
__field(struct file *, nf_file)
__field(u32, status)
),
@@ -512,12 +652,12 @@ TRACE_EVENT(nfsd_file_acquire,
__entry->status = be32_to_cpu(status);
),
- TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+ TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u",
__entry->xid, __entry->hash, __entry->inode,
- show_nf_may(__entry->may_flags), __entry->nf_ref,
- show_nf_flags(__entry->nf_flags),
- show_nf_may(__entry->nf_may), __entry->nf_file,
- __entry->status)
+ show_nfsd_may_flags(__entry->may_flags),
+ __entry->nf_ref, show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file, __entry->status)
);
DECLARE_EVENT_CLASS(nfsd_file_search_class,
@@ -533,7 +673,7 @@ DECLARE_EVENT_CLASS(nfsd_file_search_class,
__entry->hash = hash;
__entry->found = found;
),
- TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+ TP_printk("hash=0x%x inode=%p found=%d", __entry->hash,
__entry->inode, __entry->found)
);
@@ -561,7 +701,7 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
__entry->mode = inode->i_mode;
__entry->mask = mask;
),
- TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+ TP_printk("inode=%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
__entry->nlink, __entry->mode, __entry->mask)
);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1ecaceebee13..04937e51de56 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -978,18 +978,25 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
__be32 *verf)
{
struct file *file = nf->nf_file;
+ struct super_block *sb = file_inode(file)->i_sb;
struct svc_export *exp;
struct iov_iter iter;
__be32 nfserr;
int host_err;
int use_wgather;
loff_t pos = offset;
+ unsigned long exp_op_flags = 0;
unsigned int pflags = current->flags;
rwf_t flags = 0;
+ bool restore_flags = false;
trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
- if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+ if (sb->s_export_op)
+ exp_op_flags = sb->s_export_op->flags;
+
+ if (test_bit(RQ_LOCAL, &rqstp->rq_flags) &&
+ !(exp_op_flags & EXPORT_OP_REMOTE_FS)) {
/*
* We want throttling in balance_dirty_pages()
* and shrink_inactive_list() to only consider
@@ -998,6 +1005,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
* the client's dirty pages or its congested queue.
*/
current->flags |= PF_LOCAL_THROTTLE;
+ restore_flags = true;
+ }
exp = fhp->fh_export;
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
@@ -1049,7 +1058,7 @@ out_nfserr:
trace_nfsd_write_err(rqstp, fhp, offset, host_err);
nfserr = nfserrno(host_err);
}
- if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
+ if (restore_flags)
current_restore_flags(pflags, PF_LOCAL_THROTTLE);
return nfserr;
}
@@ -1724,7 +1733,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct inode *fdir, *tdir;
__be32 err;
int host_err;
- bool has_cached = false;
+ bool close_cached = false;
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
@@ -1783,8 +1792,9 @@ retry:
if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
goto out_dput_new;
- if (nfsd_has_cached_files(ndentry)) {
- has_cached = true;
+ if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
+ nfsd_has_cached_files(ndentry)) {
+ close_cached = true;
goto out_dput_old;
} else {
host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
@@ -1805,7 +1815,7 @@ retry:
* as that would do the wrong thing if the two directories
* were the same, so again we do it by hand.
*/
- if (!has_cached) {
+ if (!close_cached) {
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
}
@@ -1819,8 +1829,8 @@ retry:
* shouldn't be done with locks held however, so we delay it until this
* point and then reattempt the whole shebang.
*/
- if (has_cached) {
- has_cached = false;
+ if (close_cached) {
+ close_cached = false;
nfsd_close_cached_files(ndentry);
dput(ndentry);
goto retry;
@@ -1872,7 +1882,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
type = d_inode(rdentry)->i_mode & S_IFMT;
if (type != S_IFDIR) {
- nfsd_close_cached_files(rdentry);
+ if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
+ nfsd_close_cached_files(rdentry);
host_err = vfs_unlink(dirp, rdentry, NULL);
} else {
host_err = vfs_rmdir(dirp, rdentry);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 0ff336b0b25f..ad77387734cc 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -144,7 +144,6 @@ union nfsd_xdrstore {
#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
-int nfssvc_decode_void(struct svc_rqst *, __be32 *);
int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *);
int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
@@ -156,7 +155,6 @@ int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfssvc_encode_void(struct svc_rqst *, __be32 *);
int nfssvc_encode_stat(struct svc_rqst *, __be32 *);
int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *);
int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index ae6fa6c9cb46..456fcd7a1038 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -273,7 +273,6 @@ union nfsd3_xdrstore {
#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
-int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *);
int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *);
int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
@@ -290,7 +289,6 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *);
int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *);
int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 679d40af1bbb..a60ff5ce1a37 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -76,12 +76,7 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
struct nfsd4_change_info {
u32 atomic;
- bool change_supported;
- u32 before_ctime_sec;
- u32 before_ctime_nsec;
u64 before_change;
- u32 after_ctime_sec;
- u32 after_ctime_nsec;
u64 after_change;
};
@@ -252,7 +247,8 @@ struct nfsd4_listxattrs {
struct nfsd4_open {
u32 op_claim_type; /* request */
- struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
+ u32 op_fnamelen;
+ char * op_fname; /* request - everything but CLAIM_PREV */
u32 op_delegate_type; /* request - CLAIM_PREV only */
stateid_t op_delegate_stateid; /* request - response */
u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */
@@ -385,13 +381,6 @@ struct nfsd4_setclientid_confirm {
nfs4_verifier sc_confirm;
};
-struct nfsd4_saved_compoundargs {
- __be32 *p;
- __be32 *end;
- int pagelen;
- struct page **pagelist;
-};
-
struct nfsd4_test_stateid_id {
__be32 ts_id_status;
stateid_t ts_id_stateid;
@@ -419,8 +408,7 @@ struct nfsd4_write {
u64 wr_offset; /* request */
u32 wr_stable_how; /* request */
u32 wr_buflen; /* request */
- struct kvec wr_head;
- struct page ** wr_pagelist; /* request */
+ struct xdr_buf wr_payload; /* request */
u32 wr_bytes_written; /* response */
u32 wr_how_written; /* response */
@@ -433,7 +421,7 @@ struct nfsd4_exchange_id {
u32 flags;
clientid_t clientid;
u32 seqid;
- int spa_how;
+ u32 spa_how;
u32 spo_must_enforce[3];
u32 spo_must_allow[3];
struct xdr_netobj nii_domain;
@@ -554,7 +542,7 @@ struct nfsd4_copy {
bool cp_intra;
/* both */
- bool cp_synchronous;
+ u32 cp_synchronous;
/* response */
struct nfsd42_write_res cp_res;
@@ -615,7 +603,7 @@ struct nfsd4_copy_notify {
};
struct nfsd4_op {
- int opnum;
+ u32 opnum;
const struct nfsd4_operation * opdesc;
__be32 status;
union nfsd4_op_u {
@@ -696,15 +684,8 @@ struct svcxdr_tmpbuf {
struct nfsd4_compoundargs {
/* scratch variables for XDR decode */
- __be32 * p;
- __be32 * end;
- struct page ** pagelist;
- int pagelen;
- bool tail;
- __be32 tmp[8];
- __be32 * tmpp;
+ struct xdr_stream *xdr;
struct svcxdr_tmpbuf *to_free;
-
struct svc_rqst *rqstp;
u32 taglen;
@@ -767,22 +748,14 @@ static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = (u32)fhp->fh_post_saved;
- cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
+ cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr);
cinfo->before_change = fhp->fh_pre_change;
cinfo->after_change = fhp->fh_post_change;
- cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
- cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
- cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
- cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
-
}
bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
-int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *);
-int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *);
int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e3726aca28ed..cd4da9535aed 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -134,14 +134,9 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
-#define nilfs_cnt32_gt(a, b) \
- (typecheck(__u32, a) && typecheck(__u32, b) && \
- ((__s32)(b) - (__s32)(a) < 0))
#define nilfs_cnt32_ge(a, b) \
(typecheck(__u32, a) && typecheck(__u32, b) && \
((__s32)(a) - (__s32)(b) >= 0))
-#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
-#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
static int nilfs_prepare_segment_lock(struct super_block *sb,
struct nilfs_transaction_info *ti)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5dcda8f20c04..e85e13c50d6d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -72,7 +72,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
*/
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
- const struct qstr *name)
+ const struct qstr *name, u32 cookie)
{
struct dnotify_mark *dn_mark;
struct dnotify_struct *dn;
@@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
}
rcu_read_lock();
- f = fcheck(fd);
+ f = lookup_fd_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9167884a61ec..1192c9953620 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -268,12 +268,11 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
continue;
/*
- * If the event is for a child and this mark is on a parent not
+ * If the event is on a child and this mark is on a parent not
* watching children, don't send it!
*/
- if (event_mask & FS_EVENT_ON_CHILD &&
- type == FSNOTIFY_OBJ_TYPE_INODE &&
- !(mark->mask & FS_EVENT_ON_CHILD))
+ if (type == FSNOTIFY_OBJ_TYPE_PARENT &&
+ !(mark->mask & FS_EVENT_ON_CHILD))
continue;
marks_mask |= mark->mask;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8d3ad5ef2925..30d422b8c0fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -152,6 +152,13 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
if (mask & FS_ISDIR)
return false;
+ /*
+ * All events that are possible on child can also may be reported with
+ * parent/name info to inode/sb/mount. Otherwise, a watching parent
+ * could result in events reported with unexpected name info to sb/mount.
+ */
+ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);
+
/* Did either inode/sb/mount subscribe for events with parent/name? */
marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
@@ -232,47 +239,76 @@ notify:
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);
+static int fsnotify_handle_inode_event(struct fsnotify_group *group,
+ struct fsnotify_mark *inode_mark,
+ u32 mask, const void *data, int data_type,
+ struct inode *dir, const struct qstr *name,
+ u32 cookie)
+{
+ const struct path *path = fsnotify_data_path(data, data_type);
+ struct inode *inode = fsnotify_data_inode(data, data_type);
+ const struct fsnotify_ops *ops = group->ops;
+
+ if (WARN_ON_ONCE(!ops->handle_inode_event))
+ return 0;
+
+ if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ path && d_unlinked(path->dentry))
+ return 0;
+
+ /* Check interest of this mark in case event was sent with two marks */
+ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS))
+ return 0;
+
+ return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie);
+}
+
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
const void *data, int data_type,
struct inode *dir, const struct qstr *name,
u32 cookie, struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info);
- struct inode *inode = fsnotify_data_inode(data, data_type);
- const struct fsnotify_ops *ops = group->ops;
+ struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);
int ret;
- if (WARN_ON_ONCE(!ops->handle_inode_event))
- return 0;
-
if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) ||
WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
return 0;
- /*
- * An event can be sent on child mark iterator instead of inode mark
- * iterator because of other groups that have interest of this inode
- * and have marks on both parent and child. We can simplify this case.
- */
- if (!inode_mark) {
- inode_mark = child_mark;
- child_mark = NULL;
+ if (parent_mark) {
+ /*
+ * parent_mark indicates that the parent inode is watching
+ * children and interested in this event, which is an event
+ * possible on child. But is *this mark* watching children and
+ * interested in this event?
+ */
+ if (parent_mark->mask & FS_EVENT_ON_CHILD) {
+ ret = fsnotify_handle_inode_event(group, parent_mark, mask,
+ data, data_type, dir, name, 0);
+ if (ret)
+ return ret;
+ }
+ if (!inode_mark)
+ return 0;
+ }
+
+ if (mask & FS_EVENT_ON_CHILD) {
+ /*
+ * Some events can be sent on both parent dir and child marks
+ * (e.g. FS_ATTRIB). If both parent dir and child are
+ * watching, report the event once to parent dir with name (if
+ * interested) and once to child without name (if interested).
+ * The child watcher is expecting an event without a file name
+ * and without the FS_EVENT_ON_CHILD flag.
+ */
+ mask &= ~FS_EVENT_ON_CHILD;
dir = NULL;
name = NULL;
}
- ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name);
- if (ret || !child_mark)
- return ret;
-
- /*
- * Some events can be sent on both parent dir and child marks
- * (e.g. FS_ATTRIB). If both parent dir and child are watching,
- * report the event once to parent dir with name and once to child
- * without name.
- */
- return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL);
+ return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
+ dir, name, cookie);
}
static int send_to_group(__u32 mask, const void *data, int data_type,
@@ -430,7 +466,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
struct fsnotify_iter_info iter_info = {};
struct super_block *sb;
struct mount *mnt = NULL;
- struct inode *child = NULL;
+ struct inode *parent = NULL;
int ret = 0;
__u32 test_mask, marks_mask;
@@ -442,11 +478,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
inode = dir;
} else if (mask & FS_EVENT_ON_CHILD) {
/*
- * Event on child - report on TYPE_INODE to dir if it is
- * watching children and on TYPE_CHILD to child.
+ * Event on child - report on TYPE_PARENT to dir if it is
+ * watching children and on TYPE_INODE to child.
*/
- child = inode;
- inode = dir;
+ parent = dir;
}
sb = inode->i_sb;
@@ -460,7 +495,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if (!sb->s_fsnotify_marks &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
- (!child || !child->i_fsnotify_marks))
+ (!parent || !parent->i_fsnotify_marks))
return 0;
marks_mask = sb->s_fsnotify_mask;
@@ -468,8 +503,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
marks_mask |= mnt->mnt_fsnotify_mask;
if (inode)
marks_mask |= inode->i_fsnotify_mask;
- if (child)
- marks_mask |= child->i_fsnotify_mask;
+ if (parent)
+ marks_mask |= parent->i_fsnotify_mask;
/*
@@ -492,9 +527,9 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
fsnotify_first_mark(&inode->i_fsnotify_marks);
}
- if (child) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] =
- fsnotify_first_mark(&child->i_fsnotify_marks);
+ if (parent) {
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] =
+ fsnotify_first_mark(&parent->i_fsnotify_marks);
}
/*
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 4327d0e9c364..2007e3711916 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -24,11 +24,10 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
-extern int inotify_handle_event(struct fsnotify_group *group, u32 mask,
- const void *data, int data_type,
- struct inode *dir,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info);
+extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
+ u32 mask, struct inode *inode,
+ struct inode *dir,
+ const struct qstr *name, u32 cookie);
extern const struct fsnotify_ops inotify_fsnotify_ops;
extern struct kmem_cache *inotify_inode_mark_cachep;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 9ddcbadc98e2..1901d799909b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -55,25 +55,21 @@ static int inotify_merge(struct list_head *list,
return event_compare(last_event, event);
}
-static int inotify_one_event(struct fsnotify_group *group, u32 mask,
- struct fsnotify_mark *inode_mark,
- const struct path *path,
- const struct qstr *file_name, u32 cookie)
+int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *name, u32 cookie)
{
struct inotify_inode_mark *i_mark;
struct inotify_event_info *event;
struct fsnotify_event *fsn_event;
+ struct fsnotify_group *group = inode_mark->group;
int ret;
int len = 0;
int alloc_len = sizeof(struct inotify_event_info);
struct mem_cgroup *old_memcg;
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
- path && d_unlinked(path->dentry))
- return 0;
-
- if (file_name) {
- len = file_name->len;
+ if (name) {
+ len = name->len;
alloc_len += len + 1;
}
@@ -117,7 +113,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
event->sync_cookie = cookie;
event->name_len = len;
if (len)
- strcpy(event->name, file_name->name);
+ strcpy(event->name, name->name);
ret = fsnotify_add_event(group, fsn_event, inotify_merge);
if (ret) {
@@ -131,37 +127,6 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask,
return 0;
}
-int inotify_handle_event(struct fsnotify_group *group, u32 mask,
- const void *data, int data_type, struct inode *dir,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
-{
- const struct path *path = fsnotify_data_path(data, data_type);
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info);
- int ret = 0;
-
- if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info)))
- return 0;
-
- /*
- * Some events cannot be sent on both parent and child marks
- * (e.g. IN_CREATE). Those events are always sent on inode_mark.
- * For events that are possible on both parent and child (e.g. IN_OPEN),
- * event is sent on inode_mark with name if the parent is watching and
- * is sent on child_mark without name if child is watching.
- * If both parent and child are watching, report the event with child's
- * name here and report another event without child's name below.
- */
- if (inode_mark)
- ret = inotify_one_event(group, mask, inode_mark, path,
- file_name, cookie);
- if (ret || !child_mark)
- return ret;
-
- return inotify_one_event(group, mask, child_mark, path, NULL, 0);
-}
-
static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
{
inotify_ignored_and_remove_idr(fsn_mark, group);
@@ -227,7 +192,7 @@ static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
}
const struct fsnotify_ops inotify_fsnotify_ops = {
- .handle_event = inotify_handle_event,
+ .handle_inode_event = inotify_handle_inode_event,
.free_group_priv = inotify_free_group_priv,
.free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 186722ba3894..59c177011a0f 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -37,6 +37,15 @@
#include <asm/ioctls.h>
+/*
+ * An inotify watch requires allocating an inotify_inode_mark structure as
+ * well as pinning the watched inode. Doubling the size of a VFS inode
+ * should be more than enough to cover the additional filesystem inode
+ * size increase.
+ */
+#define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \
+ 2 * sizeof(struct inode))
+
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
@@ -486,14 +495,10 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group)
{
struct inotify_inode_mark *i_mark;
- struct fsnotify_iter_info iter_info = { };
-
- fsnotify_iter_set_report_type_mark(&iter_info, FSNOTIFY_OBJ_TYPE_INODE,
- fsn_mark);
/* Queue ignore event for the watch */
- inotify_handle_event(group, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE,
- NULL, NULL, 0, &iter_info);
+ inotify_handle_inode_event(fsn_mark, FS_IN_IGNORED, NULL, NULL, NULL,
+ 0);
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
/* remove this mark from the idr */
@@ -801,6 +806,18 @@ out:
*/
static int __init inotify_user_setup(void)
{
+ unsigned long watches_max;
+ struct sysinfo si;
+
+ si_meminfo(&si);
+ /*
+ * Allow up to 1% of addressable memory to be allocated for inotify
+ * watches (per user) limited to the range [8192, 1048576].
+ */
+ watches_max = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
+ INOTIFY_WATCH_COST;
+ watches_max = clamp(watches_max, 8192UL, 1048576UL);
+
BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
@@ -827,7 +844,7 @@ static int __init inotify_user_setup(void)
inotify_max_queued_events = 16384;
init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
- init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
+ init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
return 0;
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f42967b738eb..e5aab265dff1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -323,7 +323,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
unsigned long flags;
struct file *file = iocb->ki_filp;
struct inode *vi = file_inode(file);
- ntfs_inode *base_ni, *ni = NTFS_I(vi);
+ ntfs_inode *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
@@ -365,9 +365,6 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
err = -EOPNOTSUPP;
goto out;
}
- base_ni = ni;
- if (NInoAttr(ni))
- base_ni = ni->ext.base_ntfs_ino;
err = file_remove_privs(file);
if (unlikely(err))
goto out;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index caf563981532..f7e4cbc26eaf 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2347,7 +2347,6 @@ int ntfs_truncate(struct inode *vi)
ATTR_RECORD *a;
const char *te = " Leaving file length out of sync with i_size.";
int err, mp_size, size_change, alloc_change;
- u32 attr_len;
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
BUG_ON(NInoAttr(ni));
@@ -2721,7 +2720,6 @@ do_non_resident_truncate:
* this cannot fail since we are making the attribute smaller thus by
* definition there is enough space to do so.
*/
- attr_len = le32_to_cpu(a->length);
err = ntfs_attr_record_resize(m, a, mp_size +
le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
BUG_ON(err);
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index a0c40f1be7ac..bc1bf217b38e 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -478,7 +478,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
u8 *kaddr = NULL;
RESTART_PAGE_HEADER *rstr1_ph = NULL;
RESTART_PAGE_HEADER *rstr2_ph = NULL;
- int log_page_size, log_page_mask, err;
+ int log_page_size, err;
bool logfile_is_empty = true;
u8 log_page_bits;
@@ -501,7 +501,6 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
log_page_size = DefaultLogPageSize;
else
log_page_size = PAGE_SIZE;
- log_page_mask = log_page_size - 1;
/*
* Use ntfs_ffs() instead of ffs() to enable the compiler to
* optimize log_page_size and log_page_bits into constants.
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 79a231719460..3bd8119bed5e 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1198,7 +1198,6 @@ static int o2net_process_message(struct o2net_sock_container *sc,
msglog(hdr, "bad magic\n");
ret = -EINVAL;
goto out;
- break;
}
/* find a handler for it */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c46bf7f581a1..2a237ab00453 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1088,8 +1088,8 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
child_inode_no = parent_inode_no;
if (++i >= MAX_LOOKUP_TIMES) {
- mlog(ML_NOTICE, "max lookup times reached, filesystem "
- "may have nested directories, "
+ mlog_ratelimited(ML_NOTICE, "max lookup times reached, "
+ "filesystem may have nested directories, "
"src inode: %llu, dest inode: %llu.\n",
(unsigned long long)src_inode_no,
(unsigned long long)dest_inode_no);
diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..1e06e443a565 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1010,6 +1010,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
if (how->resolve & ~VALID_RESOLVE_FLAGS)
return -EINVAL;
+ /* Scoping flags are mutually exclusive. */
+ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
+ return -EINVAL;
+
/* Deal with the mode. */
if (WILL_CREATE(flags)) {
if (how->mode & ~S_IALLUGO)
@@ -1292,7 +1296,7 @@ EXPORT_SYMBOL(filp_close);
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
- int retval = __close_fd(current->files, fd);
+ int retval = close_fd(fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 955ecd4030f0..e5b616c93e11 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -275,7 +275,8 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
return err;
}
-struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+ bool is_upper)
{
struct ovl_fh *fh;
int fh_type, dwords;
@@ -319,7 +320,8 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
if (is_upper)
fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER;
fh->fb.len = sizeof(fh->fb) + buflen;
- fh->fb.uuid = *uuid;
+ if (ofs->config.uuid)
+ fh->fb.uuid = *uuid;
return fh;
@@ -328,8 +330,8 @@ out_err:
return ERR_PTR(err);
}
-int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
- struct dentry *upper)
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry,
+ struct dentry *lower, struct dentry *upper)
{
const struct ovl_fh *fh = NULL;
int err;
@@ -340,7 +342,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
* up and a pure upper inode.
*/
if (ovl_can_decode_fh(lower->d_sb)) {
- fh = ovl_encode_real_fh(lower, false);
+ fh = ovl_encode_real_fh(ofs, lower, false);
if (IS_ERR(fh))
return PTR_ERR(fh);
}
@@ -352,7 +354,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
fh ? fh->fb.len : 0, 0);
kfree(fh);
- return err;
+ /* Ignore -EPERM from setting "user.*" on symlink/special */
+ return err == -EPERM ? 0 : err;
}
/* Store file handle of @upper dir in @index dir entry */
@@ -362,7 +365,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
const struct ovl_fh *fh;
int err;
- fh = ovl_encode_real_fh(upper, true);
+ fh = ovl_encode_real_fh(ofs, upper, true);
if (IS_ERR(fh))
return PTR_ERR(fh);
@@ -380,6 +383,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
struct dentry *upper)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
struct inode *dir = d_inode(indexdir);
struct dentry *index = NULL;
@@ -402,7 +406,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
return -EIO;
- err = ovl_get_index_name(origin, &name);
+ err = ovl_get_index_name(ofs, origin, &name);
if (err)
return err;
@@ -411,7 +415,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (IS_ERR(temp))
goto free_name;
- err = ovl_set_upper_fh(OVL_FS(dentry->d_sb), upper, temp);
+ err = ovl_set_upper_fh(ofs, upper, temp);
if (err)
goto out;
@@ -521,7 +525,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
* hard link.
*/
if (c->origin) {
- err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp);
+ err = ovl_set_origin(ofs, c->dentry, c->lowerpath.dentry, temp);
if (err)
return err;
}
@@ -700,7 +704,7 @@ out_dput:
static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
int err;
- struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
bool to_index = false;
/*
@@ -722,7 +726,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
if (to_index) {
c->destdir = ovl_indexdir(c->dentry->d_sb);
- err = ovl_get_index_name(c->lowerpath.dentry, &c->destname);
+ err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
if (err)
return err;
} else if (WARN_ON(!c->parent)) {
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index ed35be3fafc6..41ebf52f1bbc 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -211,7 +211,8 @@ static int ovl_check_encode_origin(struct dentry *dentry)
return 1;
}
-static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen)
+static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry,
+ u32 *fid, int buflen)
{
struct ovl_fh *fh = NULL;
int err, enc_lower;
@@ -226,7 +227,7 @@ static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen)
goto fail;
/* Encode an upper or lower file handle */
- fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
+ fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) :
ovl_dentry_upper(dentry), !enc_lower);
if (IS_ERR(fh))
return PTR_ERR(fh);
@@ -249,6 +250,7 @@ fail:
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
struct inode *parent)
{
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
struct dentry *dentry;
int bytes, buflen = *max_len << 2;
@@ -260,7 +262,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
if (WARN_ON(!dentry))
return FILEID_INVALID;
- bytes = ovl_dentry_to_fid(dentry, fid, buflen);
+ bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen);
dput(dentry);
if (bytes <= 0)
return FILEID_INVALID;
@@ -680,7 +682,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
if (!ovl_upper_mnt(ofs))
return ERR_PTR(-EACCES);
- upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true);
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
if (IS_ERR_OR_NULL(upper))
return upper;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index efccb7c1f9bc..bd9dd38347ae 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -53,9 +53,10 @@ static struct file *ovl_open_realfile(const struct file *file,
err = inode_permission(realinode, MAY_OPEN | acc_mode);
if (err) {
realfile = ERR_PTR(err);
- } else if (!inode_owner_or_capable(realinode)) {
- realfile = ERR_PTR(-EPERM);
} else {
+ if (!inode_owner_or_capable(realinode))
+ flags &= ~O_NOATIME;
+
realfile = open_with_fake_path(&file->f_path, flags, realinode,
current_cred());
}
@@ -75,12 +76,6 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
struct inode *inode = file_inode(file);
int err;
- flags |= OVL_OPEN_FLAGS;
-
- /* If some flag changed that cannot be changed then something's amiss */
- if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
- return -EIO;
-
flags &= OVL_SETFL_MASK;
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
@@ -397,48 +392,6 @@ out_unlock:
return ret;
}
-static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
-{
- ssize_t ret;
- struct fd real;
- const struct cred *old_cred;
-
- ret = ovl_real_fdget(in, &real);
- if (ret)
- return ret;
-
- old_cred = ovl_override_creds(file_inode(in)->i_sb);
- ret = generic_file_splice_read(real.file, ppos, pipe, len, flags);
- revert_creds(old_cred);
-
- ovl_file_accessed(in);
- fdput(real);
- return ret;
-}
-
-static ssize_t
-ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
-{
- struct fd real;
- const struct cred *old_cred;
- ssize_t ret;
-
- ret = ovl_real_fdget(out, &real);
- if (ret)
- return ret;
-
- old_cred = ovl_override_creds(file_inode(out)->i_sb);
- ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
- revert_creds(old_cred);
-
- ovl_file_accessed(out);
- fdput(real);
- return ret;
-}
-
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct fd real;
@@ -541,46 +494,31 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fd real;
- const struct cred *old_cred;
long ret;
ret = ovl_real_fdget(file, &real);
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = security_file_ioctl(real.file, cmd, arg);
- if (!ret)
+ if (!ret) {
+ /*
+ * Don't override creds, since we currently can't safely check
+ * permissions before doing so.
+ */
ret = vfs_ioctl(real.file, cmd, arg);
- revert_creds(old_cred);
+ }
fdput(real);
return ret;
}
-static unsigned int ovl_iflags_to_fsflags(unsigned int iflags)
-{
- unsigned int flags = 0;
-
- if (iflags & S_SYNC)
- flags |= FS_SYNC_FL;
- if (iflags & S_APPEND)
- flags |= FS_APPEND_FL;
- if (iflags & S_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- if (iflags & S_NOATIME)
- flags |= FS_NOATIME_FL;
-
- return flags;
-}
-
static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
+ unsigned long arg)
{
long ret;
struct inode *inode = file_inode(file);
- unsigned int oldflags;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -591,10 +529,13 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
inode_lock(inode);
- /* Check the capability before cred override */
- oldflags = ovl_iflags_to_fsflags(READ_ONCE(inode->i_flags));
- ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (ret)
+ /*
+ * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE
+ * capability.
+ */
+ ret = -EPERM;
+ if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) &&
+ !capable(CAP_LINUX_IMMUTABLE))
goto unlock;
ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
@@ -613,46 +554,6 @@ unlock:
}
-static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- unsigned int flags;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- return ovl_ioctl_set_flags(file, cmd, arg, flags);
-}
-
-static unsigned int ovl_fsxflags_to_fsflags(unsigned int xflags)
-{
- unsigned int flags = 0;
-
- if (xflags & FS_XFLAG_SYNC)
- flags |= FS_SYNC_FL;
- if (xflags & FS_XFLAG_APPEND)
- flags |= FS_APPEND_FL;
- if (xflags & FS_XFLAG_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- if (xflags & FS_XFLAG_NOATIME)
- flags |= FS_NOATIME_FL;
-
- return flags;
-}
-
-static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- struct fsxattr fa;
-
- memset(&fa, 0, sizeof(fa));
- if (copy_from_user(&fa, (void __user *) arg, sizeof(fa)))
- return -EFAULT;
-
- return ovl_ioctl_set_flags(file, cmd, arg,
- ovl_fsxflags_to_fsflags(fa.fsx_xflags));
-}
-
long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
long ret;
@@ -663,12 +564,9 @@ long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
ret = ovl_real_ioctl(file, cmd, arg);
break;
- case FS_IOC_SETFLAGS:
- ret = ovl_ioctl_set_fsflags(file, cmd, arg);
- break;
-
case FS_IOC_FSSETXATTR:
- ret = ovl_ioctl_set_fsxflags(file, cmd, arg);
+ case FS_IOC_SETFLAGS:
+ ret = ovl_ioctl_set_flags(file, cmd, arg);
break;
default:
@@ -801,8 +699,8 @@ const struct file_operations ovl_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ovl_compat_ioctl,
#endif
- .splice_read = ovl_splice_read,
- .splice_write = ovl_splice_write,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b584dca845ba..d739e14c6814 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -329,8 +329,14 @@ static const char *ovl_get_link(struct dentry *dentry,
bool ovl_is_private_xattr(struct super_block *sb, const char *name)
{
- return strncmp(name, OVL_XATTR_PREFIX,
- sizeof(OVL_XATTR_PREFIX) - 1) == 0;
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ if (ofs->config.userxattr)
+ return strncmp(name, OVL_XATTR_USER_PREFIX,
+ sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0;
+ else
+ return strncmp(name, OVL_XATTR_TRUSTED_PREFIX,
+ sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0;
}
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
@@ -476,7 +482,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int err;
- struct inode *realinode = ovl_inode_real(inode);
+ struct inode *realinode = ovl_inode_realdata(inode);
const struct cred *old_cred;
if (!realinode->i_op->fiemap)
@@ -690,7 +696,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
* For the first, copy up case, the union nlink does not change, whether the
* operation succeeds or fails, but the upper inode nlink may change.
* Therefore, before copy up, we store the union nlink value relative to the
- * lower inode nlink in the index inode xattr trusted.overlay.nlink.
+ * lower inode nlink in the index inode xattr .overlay.nlink.
*
* For the second, upper hardlink case, the union nlink should be incremented
* or decremented IFF the operation succeeds, aligned with nlink change of the
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index a6162c4076db..3fe05fb5d145 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -150,17 +150,22 @@ invalid:
goto out;
}
-struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
- bool connected)
+struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+ struct vfsmount *mnt, bool connected)
{
struct dentry *real;
int bytes;
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return NULL;
+
/*
* Make sure that the stored uuid matches the uuid of the lower
* layer where file handle will be decoded.
+ * In case of uuid=off option just make sure that stored uuid is null.
*/
- if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid))
+ if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) :
+ !uuid_is_null(&fh->fb.uuid))
return NULL;
bytes = (fh->fb.len - offsetof(struct ovl_fb, fid));
@@ -354,7 +359,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
ofs->layers[i].fs->bad_uuid)
continue;
- origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt,
+ origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt,
connected);
if (origin)
break;
@@ -450,7 +455,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
struct ovl_fh *fh;
int err;
- fh = ovl_encode_real_fh(real, is_upper);
+ fh = ovl_encode_real_fh(ofs, real, is_upper);
err = PTR_ERR(fh);
if (IS_ERR(fh)) {
fh = NULL;
@@ -488,7 +493,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true);
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -640,12 +645,13 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
* index dir was cleared. Either way, that index cannot be used to indentify
* the overlay inode.
*/
-int ovl_get_index_name(struct dentry *origin, struct qstr *name)
+int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+ struct qstr *name)
{
struct ovl_fh *fh;
int err;
- fh = ovl_encode_real_fh(origin, false);
+ fh = ovl_encode_real_fh(ofs, origin, false);
if (IS_ERR(fh))
return PTR_ERR(fh);
@@ -694,7 +700,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
bool is_dir = d_is_dir(origin);
int err;
- err = ovl_get_index_name(origin, &name);
+ err = ovl_get_index_name(ofs, origin, &name);
if (err)
return ERR_PTR(err);
@@ -805,7 +811,7 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
if (err)
return err;
- err = ovl_set_origin(dentry, lower, upper);
+ err = ovl_set_origin(ofs, dentry, lower, upper);
if (!err)
err = ovl_set_impure(dentry->d_parent, upper->d_parent);
@@ -1003,6 +1009,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* Just make sure a corresponding data dentry has been found.
*/
if (d.metacopy || (uppermetacopy && !ctr)) {
+ pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n",
+ dentry);
err = -EIO;
goto out_put;
} else if (!d.is_dir && upperdentry && !ctr && origin_path) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index f8880aa2ba0e..b487e48c7fd4 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -22,7 +22,9 @@ enum ovl_path_type {
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
-#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
+#define OVL_XATTR_NAMESPACE "overlay."
+#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
+#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
enum ovl_xattr {
OVL_XATTR_OPAQUE,
@@ -113,10 +115,10 @@ struct ovl_fh {
#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \
offsetof(struct ovl_fb, fid))
-extern const char *ovl_xattr_table[];
+extern const char *const ovl_xattr_table[][2];
static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
{
- return ovl_xattr_table[ox];
+ return ovl_xattr_table[ox][ofs->config.userxattr];
}
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
@@ -383,8 +385,8 @@ static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET);
}
-struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
- bool connected);
+struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
+ struct vfsmount *mnt, bool connected);
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
struct dentry *upperdentry, struct ovl_path **stackp);
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
@@ -392,7 +394,8 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
-int ovl_get_index_name(struct dentry *origin, struct qstr *name);
+int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
+ struct qstr *name);
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
@@ -514,9 +517,10 @@ int ovl_maybe_copy_up(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
-struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
-int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
- struct dentry *upper);
+struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
+ bool is_upper);
+int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry,
+ struct dentry *lower, struct dentry *upper);
/* export.c */
extern const struct export_operations ovl_export_operations;
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 1b5a2094df8e..fbd5e27ce66b 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -14,9 +14,11 @@ struct ovl_config {
bool redirect_follow;
const char *redirect_mode;
bool index;
+ bool uuid;
bool nfs_export;
int xino;
bool metacopy;
+ bool userxattr;
bool ovl_volatile;
};
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 290983bcfbb3..2bd570cbe8a4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -79,7 +79,7 @@ static void ovl_dentry_release(struct dentry *dentry)
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode)
{
- struct dentry *real;
+ struct dentry *real = NULL, *lower;
/* It's an overlay file */
if (inode && d_inode(dentry) == inode)
@@ -98,9 +98,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
return real;
- real = ovl_dentry_lowerdata(dentry);
- if (!real)
+ lower = ovl_dentry_lowerdata(dentry);
+ if (!lower)
goto bug;
+ real = lower;
/* Handle recursion */
real = d_real(real, inode);
@@ -108,8 +109,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
if (!inode || inode == d_inode(real))
return real;
bug:
- WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
- inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+ WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
+ __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
+ inode ? inode->i_ino : 0, real,
+ real && d_inode(real) ? d_inode(real)->i_ino : 0);
return dentry;
}
@@ -356,6 +359,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode);
if (ofs->config.index != ovl_index_def)
seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
+ if (!ofs->config.uuid)
+ seq_puts(m, ",uuid=off");
if (ofs->config.nfs_export != ovl_nfs_export_def)
seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
"on" : "off");
@@ -410,7 +415,10 @@ enum {
OPT_REDIRECT_DIR,
OPT_INDEX_ON,
OPT_INDEX_OFF,
+ OPT_UUID_ON,
+ OPT_UUID_OFF,
OPT_NFS_EXPORT_ON,
+ OPT_USERXATTR,
OPT_NFS_EXPORT_OFF,
OPT_XINO_ON,
OPT_XINO_OFF,
@@ -429,6 +437,9 @@ static const match_table_t ovl_tokens = {
{OPT_REDIRECT_DIR, "redirect_dir=%s"},
{OPT_INDEX_ON, "index=on"},
{OPT_INDEX_OFF, "index=off"},
+ {OPT_USERXATTR, "userxattr"},
+ {OPT_UUID_ON, "uuid=on"},
+ {OPT_UUID_OFF, "uuid=off"},
{OPT_NFS_EXPORT_ON, "nfs_export=on"},
{OPT_NFS_EXPORT_OFF, "nfs_export=off"},
{OPT_XINO_ON, "xino=on"},
@@ -549,6 +560,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
index_opt = true;
break;
+ case OPT_UUID_ON:
+ config->uuid = true;
+ break;
+
+ case OPT_UUID_OFF:
+ config->uuid = false;
+ break;
+
case OPT_NFS_EXPORT_ON:
config->nfs_export = true;
nfs_export_opt = true;
@@ -585,6 +604,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
config->ovl_volatile = true;
break;
+ case OPT_USERXATTR:
+ config->userxattr = true;
+ break;
+
default:
pr_err("unrecognized mount option \"%s\" or missing value\n",
p);
@@ -688,6 +711,28 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
}
}
+
+ /* Resolve userxattr -> !redirect && !metacopy dependency */
+ if (config->userxattr) {
+ if (config->redirect_follow && redirect_opt) {
+ pr_err("conflicting options: userxattr,redirect_dir=%s\n",
+ config->redirect_mode);
+ return -EINVAL;
+ }
+ if (config->metacopy && metacopy_opt) {
+ pr_err("conflicting options: userxattr,metacopy=on\n");
+ return -EINVAL;
+ }
+ /*
+ * Silently disable default setting of redirect and metacopy.
+ * This shall be the default in the future as well: these
+ * options must be explicitly enabled if used together with
+ * userxattr.
+ */
+ config->redirect_dir = config->redirect_follow = false;
+ config->metacopy = false;
+ }
+
return 0;
}
@@ -1037,8 +1082,14 @@ ovl_posix_acl_default_xattr_handler = {
.set = ovl_posix_acl_xattr_set,
};
-static const struct xattr_handler ovl_own_xattr_handler = {
- .prefix = OVL_XATTR_PREFIX,
+static const struct xattr_handler ovl_own_trusted_xattr_handler = {
+ .prefix = OVL_XATTR_TRUSTED_PREFIX,
+ .get = ovl_own_xattr_get,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_user_xattr_handler = {
+ .prefix = OVL_XATTR_USER_PREFIX,
.get = ovl_own_xattr_get,
.set = ovl_own_xattr_set,
};
@@ -1049,12 +1100,22 @@ static const struct xattr_handler ovl_other_xattr_handler = {
.set = ovl_other_xattr_set,
};
-static const struct xattr_handler *ovl_xattr_handlers[] = {
+static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
&ovl_posix_acl_access_xattr_handler,
&ovl_posix_acl_default_xattr_handler,
#endif
- &ovl_own_xattr_handler,
+ &ovl_own_trusted_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler *ovl_user_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &ovl_posix_acl_access_xattr_handler,
+ &ovl_posix_acl_default_xattr_handler,
+#endif
+ &ovl_own_user_xattr_handler,
&ovl_other_xattr_handler,
NULL
};
@@ -1317,7 +1378,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
pr_warn("upper fs does not support RENAME_WHITEOUT.\n");
/*
- * Check if upper/work fs supports trusted.overlay.* xattr
+ * Check if upper/work fs supports (trusted|user).overlay.* xattr
*/
err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
@@ -1456,10 +1517,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
/*
* Verify upper root is exclusively associated with index dir.
- * Older kernels stored upper fh in "trusted.overlay.origin"
+ * Older kernels stored upper fh in ".overlay.origin"
* xattr. If that xattr exists, verify that it is a match to
* upper dir file handle. In any case, verify or set xattr
- * "trusted.overlay.upper" to indicate that index may have
+ * ".overlay.upper" to indicate that index may have
* directory entries.
*/
if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
@@ -1877,6 +1938,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ofs->share_whiteout = true;
ofs->config.index = ovl_index_def;
+ ofs->config.uuid = true;
ofs->config.nfs_export = ovl_nfs_export_def;
ofs->config.xino = ovl_xino_def();
ofs->config.metacopy = ovl_metacopy_def;
@@ -1956,6 +2018,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
+ if (!ofs->config.uuid && ofs->numfs > 1) {
+ pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n");
+ ofs->config.uuid = true;
+ }
+
if (!ovl_force_readonly(ofs) && ofs->config.index) {
err = ovl_get_indexdir(sb, ofs, oe, &upperpath);
if (err)
@@ -1991,7 +2058,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
- sb->s_xattr = ovl_xattr_handlers;
+ sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
+ ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
sb->s_flags |= SB_POSIXACL;
sb->s_iflags |= SB_I_SKIP_SYNC;
@@ -2028,6 +2096,7 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
static struct file_system_type ovl_fs_type = {
.owner = THIS_MODULE,
.name = "overlay",
+ .fs_flags = FS_USERNS_MOUNT,
.mount = ovl_mount,
.kill_sb = kill_anon_super,
};
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 23f475627d07..6569031af3cd 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -50,6 +50,9 @@ const struct cred *ovl_override_creds(struct super_block *sb)
*/
int ovl_can_decode_fh(struct super_block *sb)
{
+ if (!capable(CAP_DAC_READ_SEARCH))
+ return 0;
+
if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry)
return 0;
@@ -582,9 +585,10 @@ bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry,
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_TAB_ENTRY(x) \
- [x] = OVL_XATTR_PREFIX x ## _POSTFIX
+ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
+ [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX }
-const char *ovl_xattr_table[] = {
+const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN),
@@ -716,6 +720,7 @@ bool ovl_need_index(struct dentry *dentry)
/* Caller must hold OVL_I(inode)->lock */
static void ovl_cleanup_index(struct dentry *dentry)
{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
struct inode *dir = indexdir->d_inode;
struct dentry *lowerdentry = ovl_dentry_lower(dentry);
@@ -725,7 +730,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
struct qstr name = { };
int err;
- err = ovl_get_index_name(lowerdentry, &name);
+ err = ovl_get_index_name(ofs, lowerdentry, &name);
if (err)
goto fail;
@@ -879,6 +884,13 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry)
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return 0;
+ /*
+ * getxattr on user.* may fail with EACCES in case there's no
+ * read permission on the inode. Not much we can do, other than
+ * tell the caller that this is not a metacopy inode.
+ */
+ if (ofs->config.userxattr && res == -EACCES)
+ return 0;
goto out;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 0ac197658a2d..c5989cfd564d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1342,9 +1342,8 @@ out_revert_acct:
}
/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
+ * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
+ * not enough to verify that this is a pipe.
*/
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 65ec2029fa80..bb87e4d89cd8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -56,6 +56,7 @@
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/tty.h>
@@ -368,6 +369,34 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_puts(m, "vulnerable");
break;
}
+
+ seq_puts(m, "\nSpeculationIndirectBranch:\t");
+ switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) {
+ case -EINVAL:
+ seq_puts(m, "unsupported");
+ break;
+ case PR_SPEC_NOT_AFFECTED:
+ seq_puts(m, "not affected");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
+ seq_puts(m, "conditional force disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
+ seq_puts(m, "conditional disabled");
+ break;
+ case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
+ seq_puts(m, "conditional enabled");
+ break;
+ case PR_SPEC_ENABLE:
+ seq_puts(m, "always enabled");
+ break;
+ case PR_SPEC_DISABLE:
+ seq_puts(m, "always disabled");
+ break;
+ default:
+ seq_puts(m, "unknown");
+ break;
+ }
seq_putc(m, '\n');
}
@@ -382,9 +411,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
- cpumask_pr_args(task->cpus_ptr));
+ cpumask_pr_args(&task->cpus_mask));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
- cpumask_pr_args(task->cpus_ptr));
+ cpumask_pr_args(&task->cpus_mask));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
@@ -533,8 +562,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
priority = task_prio(task);
nice = task_nice(task);
- /* convert nsec -> ticks */
- start_time = nsec_to_clock_t(task->start_boottime);
+ /* apply timens offset for boottime and convert nsec -> ticks */
+ start_time =
+ nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b362523a9829..b3422cda2a91 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -405,11 +405,11 @@ print0:
static int lock_trace(struct task_struct *task)
{
- int err = mutex_lock_killable(&task->signal->exec_update_mutex);
+ int err = down_read_killable(&task->signal->exec_update_lock);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return -EPERM;
}
return 0;
@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
}
#ifdef CONFIG_STACKTRACE
@@ -2021,7 +2021,7 @@ const struct dentry_operations pid_dentry_operations =
* file type from dcache entry.
*
* Since all of the proc inode numbers are dynamically generated, the inode
- * numbers do not exist until the inode is cache. This means creating the
+ * numbers do not exist until the inode is cache. This means creating
* the dcache entry in readdir is necessary to keep the inode numbers
* reported by readdir in sync with the inode numbers reported
* by stat.
@@ -2930,7 +2930,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unsigned long flags;
int result;
- result = mutex_lock_killable(&task->signal->exec_update_mutex);
+ result = down_read_killable(&task->signal->exec_update_lock);
if (result)
return result;
@@ -2966,7 +2966,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
- mutex_unlock(&task->signal->exec_update_mutex);
+ up_read(&task->signal->exec_update_lock);
return result;
}
@@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PID_ARCH_STATUS
ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
+#ifdef CONFIG_SECCOMP_CACHE_DEBUG
+ ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..cb51763ed554 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -28,14 +28,13 @@ static int seq_show(struct seq_file *m, void *v)
if (!task)
return -ENOENT;
- files = get_files_struct(task);
- put_task_struct(task);
-
+ task_lock(task);
+ files = task->files;
if (files) {
unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
+ file = files_lookup_fd_locked(files, fd);
if (file) {
struct fdtable *fdt = files_fdtable(files);
@@ -47,8 +46,9 @@ static int seq_show(struct seq_file *m, void *v)
ret = 0;
}
spin_unlock(&files->file_lock);
- put_files_struct(files);
}
+ task_unlock(task);
+ put_task_struct(task);
if (ret)
return ret;
@@ -57,6 +57,7 @@ static int seq_show(struct seq_file *m, void *v)
(long long)file->f_pos, f_flags,
real_mount(file->f_path.mnt)->mnt_id);
+ /* show_fd_locks() never deferences files so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -83,18 +84,13 @@ static const struct file_operations proc_fdinfo_file_operations = {
static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
- struct files_struct *files = get_files_struct(task);
struct file *file;
- if (!files)
- return false;
-
rcu_read_lock();
- file = fcheck_files(files, fd);
+ file = task_lookup_fd_rcu(task, fd);
if (file)
*mode = file->f_mode;
rcu_read_unlock();
- put_files_struct(files);
return !!file;
}
@@ -146,29 +142,22 @@ static const struct dentry_operations tid_fd_dentry_operations = {
static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- struct files_struct *files = NULL;
struct task_struct *task;
int ret = -ENOENT;
task = get_proc_task(d_inode(dentry));
if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
-
- if (files) {
unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
- spin_lock(&files->file_lock);
- fd_file = fcheck_files(files, fd);
+ fd_file = fget_task(task, fd);
if (fd_file) {
*path = fd_file->f_path;
path_get(&fd_file->f_path);
ret = 0;
+ fput(fd_file);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ put_task_struct(task);
}
return ret;
@@ -229,7 +218,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
instantiate_t instantiate)
{
struct task_struct *p = get_proc_task(file_inode(file));
- struct files_struct *files;
unsigned int fd;
if (!p)
@@ -237,22 +225,18 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- files = get_files_struct(p);
- if (!files)
- goto out;
rcu_read_lock();
- for (fd = ctx->pos - 2;
- fd < files_fdtable(files)->max_fds;
- fd++, ctx->pos++) {
+ for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = fcheck_files(files, fd);
+ f = task_lookup_next_fd_rcu(p, &fd);
+ ctx->pos = fd + 2LL;
if (!f)
- continue;
+ break;
data.mode = f->f_mode;
rcu_read_unlock();
data.fd = fd;
@@ -261,13 +245,11 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out_fd_loop;
+ goto out;
cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
-out_fd_loop:
- put_files_struct(files);
out:
put_task_struct(p);
return 0;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b84663252add..6c0a05f55d6b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -349,6 +349,16 @@ static const struct file_operations proc_dir_operations = {
.iterate_shared = proc_readdir,
};
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return 0;
+}
+
+const struct dentry_operations proc_net_dentry_ops = {
+ .d_revalidate = proc_net_d_revalidate,
+ .d_delete = always_delete_dentry,
+};
+
/*
* proc directories can do almost nothing..
*/
@@ -471,8 +481,8 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
- struct proc_dir_entry *parent, void *data)
+struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data, bool force_lookup)
{
struct proc_dir_entry *ent;
@@ -484,10 +494,20 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->data = data;
ent->proc_dir_ops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
+ if (force_lookup) {
+ pde_force_lookup(ent);
+ }
ent = proc_register(parent, ent);
}
return ent;
}
+EXPORT_SYMBOL_GPL(_proc_mkdir);
+
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+ struct proc_dir_entry *parent, void *data)
+{
+ return _proc_mkdir(name, mode, parent, data, false);
+}
EXPORT_SYMBOL_GPL(proc_mkdir_data);
struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 917cc85e3466..f60b379dcdc7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -190,10 +190,9 @@ struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_e
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
-static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
+static inline void pde_get(struct proc_dir_entry *pde)
{
refcount_inc(&pde->refcnt);
- return pde;
}
extern void pde_put(struct proc_dir_entry *);
@@ -310,3 +309,10 @@ extern unsigned long task_statm(struct mm_struct *,
unsigned long *, unsigned long *,
unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);
+
+extern const struct dentry_operations proc_net_dentry_ops;
+static inline void pde_force_lookup(struct proc_dir_entry *pde)
+{
+ /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+ pde->proc_dops = &proc_net_dentry_ops;
+}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e502414b3556..4d2e64e9016c 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -193,8 +193,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
return 1;
p = pfn_to_page(pfn);
- if (!memmap_valid_within(pfn, p, page_zone(p)))
- return 1;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..d6fc74619625 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_KERNEL_SCS_KB));
#endif
show_val_kb(m, "PageTables: ",
- global_zone_page_state(NR_PAGETABLE));
+ global_node_page_state(NR_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index ed8a6306990c..18601042af99 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -39,22 +39,6 @@ static struct net *get_proc_net(const struct inode *inode)
return maybe_get_net(PDE_NET(PDE(inode)));
}
-static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 0;
-}
-
-static const struct dentry_operations proc_net_dentry_ops = {
- .d_revalidate = proc_net_d_revalidate,
- .d_delete = always_delete_dentry,
-};
-
-static void pde_force_lookup(struct proc_dir_entry *pde)
-{
- /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
- pde->proc_dops = &proc_net_dentry_ops;
-}
-
static int seq_open_net(struct inode *inode, struct file *file)
{
unsigned int state_size = PDE(inode)->state_size;
@@ -140,7 +124,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data);
* @mode: The file's access mode.
* @parent: The parent directory in which to create.
* @ops: The seq_file ops with which to read the file.
- * @write: The write method which which to 'modify' the file.
+ * @write: The write method with which to 'modify' the file.
* @data: Data for retrieval by PDE_DATA().
*
* Create a network namespaced proc file in the @parent directory with the
@@ -232,7 +216,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single);
* @mode: The file's access mode.
* @parent: The parent directory in which to create.
* @show: The seqfile show method with which to read the file.
- * @write: The write method which which to 'modify' the file.
+ * @write: The write method with which to 'modify' the file.
* @data: Data for retrieval by PDE_DATA().
*
* Create a network-namespaced proc file in the @parent directory with the
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 4695b6de3151..f25e8531fd27 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,6 +10,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/irqnr.h>
#include <linux/sched/cputime.h>
#include <linux/tick.h>
@@ -118,6 +119,8 @@ static int show_stat(struct seq_file *p, void *v)
irq = softirq = steal = 0;
guest = guest_nice = 0;
getboottime64(&boottime);
+ /* shift boot timestamp according to the timens offset */
+ timens_sub_boottime(&boottime);
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index e16a49ebfe54..8adabde685f1 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -14,6 +14,14 @@ config PSTORE
If you don't have a platform persistent store driver,
say N.
+config PSTORE_DEFAULT_KMSG_BYTES
+ int "Default kernel log storage space" if EXPERT
+ depends on PSTORE
+ default "10240"
+ help
+ Defines default size of pstore kernel log storage.
+ Can be enlarged if needed, not recommended to shrink it.
+
config PSTORE_DEFLATE_COMPRESS
tristate "DEFLATE (ZLIB) compression"
default y
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index fcd5563dde06..4bb8a344957a 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -90,7 +90,6 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage");
static DEFINE_MUTEX(pstore_blk_lock);
static struct block_device *psblk_bdev;
static struct pstore_zone_info *pstore_zone_info;
-static pstore_blk_panic_write_op blkdev_panic_write;
struct bdev_info {
dev_t devt;
@@ -245,7 +244,7 @@ static struct block_device *psblk_get_bdev(void *holder,
return bdev;
}
- nr_sects = part_nr_sects_read(bdev->bd_part);
+ nr_sects = bdev_nr_sectors(bdev);
if (!nr_sects) {
pr_err("not enough space for '%s'\n", blkdev);
blkdev_put(bdev, mode);
@@ -341,24 +340,11 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
return ret;
}
-static ssize_t psblk_blk_panic_write(const char *buf, size_t size,
- loff_t off)
-{
- int ret;
-
- if (!blkdev_panic_write)
- return -EOPNOTSUPP;
-
- /* size and off must align to SECTOR_SIZE for block device */
- ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT,
- size >> SECTOR_SHIFT);
- /* try next zone */
- if (ret == -ENOMSG)
- return ret;
- return ret ? -EIO : size;
-}
-
-static int __register_pstore_blk(struct pstore_blk_info *info)
+/*
+ * This takes its configuration only from the module parameters now.
+ * See psblk_get_bdev() and blkdev.
+ */
+static int __register_pstore_blk(void)
{
char bdev_name[BDEVNAME_SIZE];
struct block_device *bdev;
@@ -378,68 +364,34 @@ static int __register_pstore_blk(struct pstore_blk_info *info)
}
/* only allow driver matching the @blkdev */
- if (!binfo.devt || (!best_effort &&
- MAJOR(binfo.devt) != info->major)) {
- pr_debug("invalid major %u (expect %u)\n",
- info->major, MAJOR(binfo.devt));
+ if (!binfo.devt) {
+ pr_debug("no major\n");
ret = -ENODEV;
goto err_put_bdev;
}
/* psblk_bdev must be assigned before register to pstore/blk */
psblk_bdev = bdev;
- blkdev_panic_write = info->panic_write;
-
- /* Copy back block device details. */
- info->devt = binfo.devt;
- info->nr_sects = binfo.nr_sects;
- info->start_sect = binfo.start_sect;
memset(&dev, 0, sizeof(dev));
- dev.total_size = info->nr_sects << SECTOR_SHIFT;
- dev.flags = info->flags;
+ dev.total_size = binfo.nr_sects << SECTOR_SHIFT;
dev.read = psblk_generic_blk_read;
dev.write = psblk_generic_blk_write;
- dev.erase = NULL;
- dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL;
ret = __register_pstore_device(&dev);
if (ret)
goto err_put_bdev;
bdevname(bdev, bdev_name);
- pr_info("attached %s%s\n", bdev_name,
- info->panic_write ? "" : " (no dedicated panic_write!)");
+ pr_info("attached %s (no dedicated panic_write!)\n", bdev_name);
return 0;
err_put_bdev:
psblk_bdev = NULL;
- blkdev_panic_write = NULL;
psblk_put_bdev(bdev, holder);
return ret;
}
-/**
- * register_pstore_blk() - register block device to pstore/blk
- *
- * @info: details on the desired block device interface
- *
- * Return:
- * * 0 - OK
- * * Others - something error.
- */
-int register_pstore_blk(struct pstore_blk_info *info)
-{
- int ret;
-
- mutex_lock(&pstore_blk_lock);
- ret = __register_pstore_blk(info);
- mutex_unlock(&pstore_blk_lock);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(register_pstore_blk);
-
static void __unregister_pstore_blk(unsigned int major)
{
struct pstore_device_info dev = { .read = psblk_generic_blk_read };
@@ -449,24 +401,10 @@ static void __unregister_pstore_blk(unsigned int major)
if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
__unregister_pstore_device(&dev);
psblk_put_bdev(psblk_bdev, holder);
- blkdev_panic_write = NULL;
psblk_bdev = NULL;
}
}
-/**
- * unregister_pstore_blk() - unregister block device from pstore/blk
- *
- * @major: the major device number of device
- */
-void unregister_pstore_blk(unsigned int major)
-{
- mutex_lock(&pstore_blk_lock);
- __unregister_pstore_blk(major);
- mutex_unlock(&pstore_blk_lock);
-}
-EXPORT_SYMBOL_GPL(unregister_pstore_blk);
-
/* get information of pstore/blk */
int pstore_blk_get_config(struct pstore_blk_config *info)
{
@@ -483,12 +421,11 @@ EXPORT_SYMBOL_GPL(pstore_blk_get_config);
static int __init pstore_blk_init(void)
{
- struct pstore_blk_info info = { };
int ret = 0;
mutex_lock(&pstore_blk_lock);
if (!pstore_zone_info && best_effort && blkdev[0])
- ret = __register_pstore_blk(&info);
+ ret = __register_pstore_blk();
mutex_unlock(&pstore_blk_lock);
return ret;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index c331efe8de95..93a217e4f563 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -266,7 +266,7 @@ static void parse_options(char *options)
*/
static int pstore_show_options(struct seq_file *m, struct dentry *root)
{
- if (kmsg_bytes != PSTORE_DEFAULT_KMSG_BYTES)
+ if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES)
seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes);
return 0;
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 7fb219042f13..801d6c0b170c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -6,7 +6,6 @@
#include <linux/time.h>
#include <linux/pstore.h>
-#define PSTORE_DEFAULT_KMSG_BYTES 10240
extern unsigned long kmsg_bytes;
#ifdef CONFIG_PSTORE_FTRACE
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 36714df37d5d..32f64abc277c 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -101,7 +101,7 @@ static char *big_oops_buf;
static size_t big_oops_buf_sz;
/* How much of the console log to snapshot */
-unsigned long kmsg_bytes = PSTORE_DEFAULT_KMSG_BYTES;
+unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
void pstore_set_kmsg_bytes(int bytes)
{
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 3ce89216670c..5266ccbec007 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -1299,6 +1299,10 @@ int register_pstore_zone(struct pstore_zone_info *info)
pr_warn("total_size must be >= 4096\n");
return -EINVAL;
}
+ if (info->total_size > SZ_128M) {
+ pr_warn("capping size to 128MiB\n");
+ info->total_size = SZ_128M;
+ }
if (!info->kmsg_size && !info->pmsg_size && !info->console_size &&
!info->ftrace_size) {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index bb02989d92b6..4f1373463766 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2455,7 +2455,7 @@ int dquot_resume(struct super_block *sb, int type)
ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
flags);
if (ret < 0)
- vfs_cleanup_quota_inode(sb, type);
+ vfs_cleanup_quota_inode(sb, cnt);
}
return ret;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 9af95c7a0bbe..6d16b2be5ac4 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -20,6 +20,7 @@
#include <linux/writeback.h>
#include <linux/nospec.h>
#include "compat.h"
+#include "../internal.h"
static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
qid_t id)
@@ -865,27 +866,42 @@ static bool quotactl_cmd_onoff(int cmd)
static struct super_block *quotactl_block(const char __user *special, int cmd)
{
#ifdef CONFIG_BLOCK
- struct block_device *bdev;
struct super_block *sb;
struct filename *tmp = getname(special);
+ bool excl = false, thawed = false;
+ int error;
+ dev_t dev;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
- bdev = lookup_bdev(tmp->name);
+ error = lookup_bdev(tmp->name, &dev);
putname(tmp);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
- if (quotactl_cmd_onoff(cmd))
- sb = get_super_exclusive_thawed(bdev);
- else if (quotactl_cmd_write(cmd))
- sb = get_super_thawed(bdev);
- else
- sb = get_super(bdev);
- bdput(bdev);
+ if (error)
+ return ERR_PTR(error);
+
+ if (quotactl_cmd_onoff(cmd)) {
+ excl = true;
+ thawed = true;
+ } else if (quotactl_cmd_write(cmd)) {
+ thawed = true;
+ }
+
+retry:
+ sb = user_get_super(dev, excl);
if (!sb)
return ERR_PTR(-ENODEV);
-
+ if (thawed && sb->s_writers.frozen != SB_UNFROZEN) {
+ if (excl)
+ up_write(&sb->s_umount);
+ else
+ up_read(&sb->s_umount);
+ wait_event(sb->s_writers.wait_unfrozen,
+ sb->s_writers.frozen == SB_UNFROZEN);
+ put_super(sb);
+ goto retry;
+ }
return sb;
+
#else
return ERR_PTR(-ENODEV);
#endif
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index a6f856f341dc..c5562c871c8b 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
memset(buf, 0, info->dqi_usable_bs);
return sb->s_op->quota_read(sb, info->dqi_type, buf,
- info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+ info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
}
static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
@@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
ssize_t ret;
ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
- info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
+ info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
if (ret != info->dqi_usable_bs) {
quota_error(sb, "dquota write failed");
if (ret >= 0)
@@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
blk);
goto out_buf;
}
- dquot->dq_off = (blk << info->dqi_blocksize_bits) +
+ dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
sizeof(struct qt_disk_dqdbheader) +
i * info->dqi_entry_size;
kfree(buf);
@@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
ret = -EIO;
goto out_buf;
} else {
- ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
+ ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
qt_disk_dqdbheader) + i * info->dqi_entry_size;
}
out_buf:
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e69a2bfdd81c..c21106557a37 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -157,6 +157,25 @@ static int v2_read_file_info(struct super_block *sb, int type)
qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
qinfo->dqi_ops = &v2r1_qtree_ops;
}
+ ret = -EUCLEAN;
+ /* Some sanity checks of the read headers... */
+ if ((loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits >
+ i_size_read(sb_dqopt(sb)->files[type])) {
+ quota_error(sb, "Number of blocks too big for quota file size (%llu > %llu).",
+ (loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits,
+ i_size_read(sb_dqopt(sb)->files[type]));
+ goto out;
+ }
+ if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
+ quota_error(sb, "Free block number too big (%u >= %u).",
+ qinfo->dqi_free_blk, qinfo->dqi_blocks);
+ goto out;
+ }
+ if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
+ quota_error(sb, "Block with free entry too big (%u >= %u).",
+ qinfo->dqi_free_entry, qinfo->dqi_blocks);
+ goto out;
+ }
ret = 0;
out:
up_read(&dqopt->dqio_sem);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 8bf88d690729..476a7ff49482 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -454,6 +454,12 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
"(second one): %h", ih);
return 0;
}
+ if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) {
+ reiserfs_warning(NULL, "reiserfs-5093",
+ "item entry count seems wrong %h",
+ ih);
+ return 0;
+ }
prev_location = ih_location(ih);
}
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e6099beefa97..77dba3a49e65 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -456,8 +456,16 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
if (ret)
return ret;
+ /*
+ * This is redundant if called from vfs_dedupe_file_range(), but other
+ * callers need it and it's not performance sesitive...
+ */
+ ret = remap_verify_area(src_file, src_pos, len, false);
+ if (ret)
+ goto out_drop_write;
+
ret = remap_verify_area(dst_file, dst_pos, len, true);
- if (ret < 0)
+ if (ret)
goto out_drop_write;
ret = -EPERM;
diff --git a/fs/statfs.c b/fs/statfs.c
index 59f33752c131..68cb07788750 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -235,7 +235,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
{
- struct super_block *s = user_get_super(dev);
+ struct super_block *s = user_get_super(dev, false);
int err;
if (!s)
return -EINVAL;
diff --git a/fs/super.c b/fs/super.c
index 98bb0629ee10..2c6cdea2ab2d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -307,7 +307,7 @@ static void __put_super(struct super_block *s)
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -740,7 +740,14 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
-static struct super_block *__get_super(struct block_device *bdev, bool excl)
+/**
+ * get_super - get the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given. %NULL is returned if no match is found.
+ */
+struct super_block *get_super(struct block_device *bdev)
{
struct super_block *sb;
@@ -755,17 +762,11 @@ rescan:
if (sb->s_bdev == bdev) {
sb->s_count++;
spin_unlock(&sb_lock);
- if (!excl)
- down_read(&sb->s_umount);
- else
- down_write(&sb->s_umount);
+ down_read(&sb->s_umount);
/* still alive? */
if (sb->s_root && (sb->s_flags & SB_BORN))
return sb;
- if (!excl)
- up_read(&sb->s_umount);
- else
- up_write(&sb->s_umount);
+ up_read(&sb->s_umount);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
@@ -777,66 +778,6 @@ rescan:
}
/**
- * get_super - get the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. %NULL is returned if no match is found.
- */
-struct super_block *get_super(struct block_device *bdev)
-{
- return __get_super(bdev, false);
-}
-EXPORT_SYMBOL(get_super);
-
-static struct super_block *__get_super_thawed(struct block_device *bdev,
- bool excl)
-{
- while (1) {
- struct super_block *s = __get_super(bdev, excl);
- if (!s || s->s_writers.frozen == SB_UNFROZEN)
- return s;
- if (!excl)
- up_read(&s->s_umount);
- else
- up_write(&s->s_umount);
- wait_event(s->s_writers.wait_unfrozen,
- s->s_writers.frozen == SB_UNFROZEN);
- put_super(s);
- }
-}
-
-/**
- * get_super_thawed - get thawed superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device. The superblock is returned once it is thawed
- * (or immediately if it was not frozen). %NULL is returned if no match
- * is found.
- */
-struct super_block *get_super_thawed(struct block_device *bdev)
-{
- return __get_super_thawed(bdev, false);
-}
-EXPORT_SYMBOL(get_super_thawed);
-
-/**
- * get_super_exclusive_thawed - get thawed superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device. The superblock is returned once it is thawed
- * (or immediately if it was not frozen) and s_umount semaphore is held
- * in exclusive mode. %NULL is returned if no match is found.
- */
-struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
-{
- return __get_super_thawed(bdev, true);
-}
-EXPORT_SYMBOL(get_super_exclusive_thawed);
-
-/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -867,7 +808,7 @@ restart:
return NULL;
}
-struct super_block *user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev, bool excl)
{
struct super_block *sb;
@@ -879,11 +820,17 @@ rescan:
if (sb->s_dev == dev) {
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
+ if (excl)
+ down_write(&sb->s_umount);
+ else
+ down_read(&sb->s_umount);
/* still alive? */
if (sb->s_root && (sb->s_flags & SB_BORN))
return sb;
- up_read(&sb->s_umount);
+ if (excl)
+ up_write(&sb->s_umount);
+ else
+ up_read(&sb->s_umount);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index b93b3cd10bfd..0886d835f597 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -12,7 +12,6 @@
#include <linux/crypto.h>
#include <linux/verification.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
#include <crypto/algapi.h>
#include <keys/user-type.h>
#include <keys/asymmetric-type.h>
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 155521e51ac5..7949d7c9aa8c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -203,6 +203,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
err = fscrypt_prepare_lookup(dir, dentry, &nm);
+ generic_set_encrypted_ci_d_ops(dentry);
if (err == -ENOENT)
return d_splice_alias(NULL, dentry);
if (err)
@@ -270,6 +271,15 @@ done:
return d_splice_alias(inode, dentry);
}
+static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry,
+ struct fscrypt_name *nm)
+{
+ if (fscrypt_is_nokey_name(dentry))
+ return -ENOKEY;
+
+ return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm);
+}
+
static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
@@ -293,7 +303,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (err)
return err;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
@@ -505,7 +515,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
return 0;
if (encrypted) {
- err = fscrypt_get_encryption_info(dir);
+ err = fscrypt_prepare_readdir(dir);
if (err)
return err;
@@ -953,7 +963,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
return err;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
@@ -1038,7 +1048,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
return err;
}
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ err = ubifs_prepare_create(dir, dentry, &nm);
if (err) {
kfree(dev);
goto out_budg;
@@ -1122,7 +1132,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
return err;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
goto out_budg;
@@ -1610,14 +1620,6 @@ int ubifs_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-static int ubifs_dir_open(struct inode *dir, struct file *file)
-{
- if (IS_ENCRYPTED(dir))
- return fscrypt_get_encryption_info(dir) ? -EACCES : 0;
-
- return 0;
-}
-
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
@@ -1644,7 +1646,6 @@ const struct file_operations ubifs_dir_operations = {
.iterate_shared = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
- .open = ubifs_dir_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 000b457ad087..894cc28142e7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -28,7 +28,7 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
-int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+int sysctl_unprivileged_userfaultfd __read_mostly;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
@@ -405,6 +405,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
+ if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
+ ctx->flags & UFFD_USER_MODE_ONLY) {
+ printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+ "sysctl knob to 1 if kernel faults must be handled "
+ "without obtaining CAP_SYS_PTRACE capability\n");
+ goto out;
+ }
/*
* If it's already released don't get it. This avoids to loop
@@ -1959,16 +1966,23 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
struct userfaultfd_ctx *ctx;
int fd;
- if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+ if (!sysctl_unprivileged_userfaultfd &&
+ (flags & UFFD_USER_MODE_ONLY) == 0 &&
+ !capable(CAP_SYS_PTRACE)) {
+ printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+ "sysctl knob to 1 if kernel faults must be handled "
+ "without obtaining CAP_SYS_PTRACE capability\n");
return -EPERM;
+ }
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
- if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+ if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
return -EINVAL;
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 5ab3bbec8108..f7e997a01ad0 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/enable.c: ioctl to enable verity on a file
+ * Ioctl to enable verity on a file
*
* Copyright 2019 Google LLC
*/
@@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
* Some pages of the file may have been evicted from pagecache after
* being used in the Merkle tree construction, then read into pagecache
* again by another process reading from the file concurrently. Since
- * these pages didn't undergo verification against the file measurement
- * which fs-verity now claims to be enforcing, we have to wipe the
- * pagecache to ensure that all future reads are verified.
+ * these pages didn't undergo verification against the file digest which
+ * fs-verity now claims to be enforcing, we have to wipe the pagecache
+ * to ensure that all future reads are verified.
*/
filemap_write_and_wait(inode->i_mapping);
invalidate_inode_pages2(inode->i_mapping);
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index e96d99d5145e..6413d28664d6 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -14,7 +14,7 @@
#define pr_fmt(fmt) "fs-verity: " fmt
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <linux/fsverity.h>
#include <linux/mempool.h>
@@ -67,52 +67,22 @@ struct merkle_tree_params {
* When a verity file is first opened, an instance of this struct is allocated
* and stored in ->i_verity_info; it remains until the inode is evicted. It
* caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file. It also caches the file measurement. The Merkle
- * tree pages themselves are not cached here, but the filesystem may cache them.
+ * data read from the file. It also caches the file digest. The Merkle tree
+ * pages themselves are not cached here, but the filesystem may cache them.
*/
struct fsverity_info {
struct merkle_tree_params tree_params;
u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
- u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+ u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
const struct inode *inode;
};
-/*
- * Merkle tree properties. The file measurement is the hash of this structure
- * excluding the signature and with the sig_size field set to 0.
- */
-struct fsverity_descriptor {
- __u8 version; /* must be 1 */
- __u8 hash_algorithm; /* Merkle tree hash algorithm */
- __u8 log_blocksize; /* log2 of size of data and tree blocks */
- __u8 salt_size; /* size of salt in bytes; 0 if none */
- __le32 sig_size; /* size of signature in bytes; 0 if none */
- __le64 data_size; /* size of file the Merkle tree is built over */
- __u8 root_hash[64]; /* Merkle tree root hash */
- __u8 salt[32]; /* salt prepended to each hashed block */
- __u8 __reserved[144]; /* must be 0's */
- __u8 signature[]; /* optional PKCS#7 signature */
-};
-
/* Arbitrary limit to bound the kmalloc() size. Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384
#define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
sizeof(struct fsverity_descriptor))
-/*
- * Format in which verity file measurements are signed. This is the same as
- * 'struct fsverity_digest', except here some magic bytes are prepended to
- * provide some context about what is being signed in case the same key is used
- * for non-fsverity purposes, and here the fields have fixed endianness.
- */
-struct fsverity_signed_digest {
- char magic[8]; /* must be "FSVerity" */
- __le16 digest_algorithm;
- __le16 digest_size;
- __u8 digest[];
-};
-
/* hash_algs.c */
extern struct fsverity_hash_alg fsverity_hash_algs[];
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index c37e186ebeb6..71d0fccb6d4c 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/hash_algs.c: fs-verity hash algorithms
+ * fs-verity hash algorithms
*
* Copyright 2019 Google LLC
*/
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 94c104e00861..c98b7016f446 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/init.c: fs-verity module initialization and logging
+ * fs-verity module initialization and logging
*
* Copyright 2019 Google LLC
*/
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index df409a5682ed..f0d7b30c62db 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/measure.c: ioctl to get a verity file's measurement
+ * Ioctl to get a verity file's digest
*
* Copyright 2019 Google LLC
*/
@@ -10,12 +10,12 @@
#include <linux/uaccess.h>
/**
- * fsverity_ioctl_measure() - get a verity file's measurement
- * @filp: file to get measurement of
+ * fsverity_ioctl_measure() - get a verity file's digest
+ * @filp: file to get digest of
* @_uarg: user pointer to fsverity_digest
*
- * Retrieve the file measurement that the kernel is enforcing for reads from a
- * verity file. See the "FS_IOC_MEASURE_VERITY" section of
+ * Retrieve the file digest that the kernel is enforcing for reads from a verity
+ * file. See the "FS_IOC_MEASURE_VERITY" section of
* Documentation/filesystems/fsverity.rst for the documentation.
*
* Return: 0 on success, -errno on failure
@@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
if (copy_to_user(uarg, &arg, sizeof(arg)))
return -EFAULT;
- if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
+ if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size))
return -EFAULT;
return 0;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index bfe0280c14e4..228d0eca3e2e 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/open.c: opening fs-verity files
+ * Opening fs-verity files
*
* Copyright 2019 Google LLC
*/
@@ -124,18 +124,18 @@ out_err:
}
/*
- * Compute the file measurement by hashing the fsverity_descriptor excluding the
+ * Compute the file digest by hashing the fsverity_descriptor excluding the
* signature and with the sig_size field set to 0.
*/
-static int compute_file_measurement(struct fsverity_hash_alg *hash_alg,
- struct fsverity_descriptor *desc,
- u8 *measurement)
+static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
+ struct fsverity_descriptor *desc,
+ u8 *file_digest)
{
__le32 sig_size = desc->sig_size;
int err;
desc->sig_size = 0;
- err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+ err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
desc->sig_size = sig_size;
return err;
@@ -199,15 +199,15 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
- err = compute_file_measurement(vi->tree_params.hash_alg, desc,
- vi->measurement);
+ err = compute_file_digest(vi->tree_params.hash_alg, desc,
+ vi->file_digest);
if (err) {
- fsverity_err(inode, "Error %d computing file measurement", err);
+ fsverity_err(inode, "Error %d computing file digest", err);
goto out;
}
- pr_debug("Computed file measurement: %s:%*phN\n",
+ pr_debug("Computed file digest: %s:%*phN\n",
vi->tree_params.hash_alg->name,
- vi->tree_params.digest_size, vi->measurement);
+ vi->tree_params.digest_size, vi->file_digest);
err = fsverity_verify_signature(vi, desc, desc_size);
out:
@@ -354,7 +354,7 @@ int __init fsverity_init_info_cache(void)
{
fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
SLAB_RECLAIM_ACCOUNT,
- measurement);
+ file_digest);
if (!fsverity_info_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index b14ed96387ec..012468eda2a7 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/signature.c: verification of builtin signatures
+ * Verification of builtin signatures
*
* Copyright 2019 Google LLC
*/
@@ -32,8 +32,8 @@ static struct key *fsverity_keyring;
* @desc: the file's fsverity_descriptor
* @desc_size: size of @desc
*
- * If the file's fs-verity descriptor includes a signature of the file
- * measurement, verify it against the certificates in the fs-verity keyring.
+ * If the file's fs-verity descriptor includes a signature of the file digest,
+ * verify it against the certificates in the fs-verity keyring.
*
* Return: 0 on success (signature valid or not required); -errno on failure
*/
@@ -44,7 +44,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
const struct inode *inode = vi->inode;
const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
const u32 sig_size = le32_to_cpu(desc->sig_size);
- struct fsverity_signed_digest *d;
+ struct fsverity_formatted_digest *d;
int err;
if (sig_size == 0) {
@@ -67,7 +67,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
memcpy(d->magic, "FSVerity", 8);
d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
d->digest_size = cpu_to_le16(hash_alg->digest_size);
- memcpy(d->digest, vi->measurement, hash_alg->digest_size);
+ memcpy(d->digest, vi->file_digest, hash_alg->digest_size);
err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
desc->signature, sig_size,
@@ -90,8 +90,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return err;
}
- pr_debug("Valid signature for file measurement %s:%*phN\n",
- hash_alg->name, hash_alg->digest_size, vi->measurement);
+ pr_debug("Valid signature for file digest %s:%*phN\n",
+ hash_alg->name, hash_alg->digest_size, vi->file_digest);
return 0;
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a8b68c6f663d..0adb970f4e73 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
+ * Data verification functions, i.e. hooks for ->readpages()
*
* Copyright 2019 Google LLC
*/
diff --git a/fs/xattr.c b/fs/xattr.c
index cd7a563e8bcd..fd57153b1f61 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
{
struct inode *inode = dentry->d_inode;
struct inode *delegated_inode = NULL;
+ const void *orig_value = value;
int error;
+ if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
+ error = cap_convert_nscap(dentry, &value, size);
+ if (error < 0)
+ return error;
+ size = error;
+ }
+
retry_deleg:
inode_lock(inode);
error = __vfs_setxattr_locked(dentry, name, value, size, flags,
@@ -289,6 +297,9 @@ retry_deleg:
if (!error)
goto retry_deleg;
}
+ if (value != orig_value)
+ kfree(value);
+
return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
posix_acl_fix_xattr_from_user(kvalue, size);
- else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
- error = cap_convert_nscap(d, &kvalue, size);
- if (error < 0)
- goto out;
- size = error;
- }
}
error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ef1d5bb88b93..b7c5783a031c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -433,13 +433,10 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
-
- if (sb && !IS_ERR(sb)) {
+ if (!freeze_bdev(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(sb->s_bdev, sb);
+ thaw_bdev(mp->m_super->s_bdev);
}
-
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH: