diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-08 11:10:02 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-08 11:10:02 -0700 |
commit | 2bd5d41e0e9d8e423a0bd446ee174584c8a495fe (patch) | |
tree | 720682607966689b7579344c5e8890f8bc8a6628 | |
parent | 65512eb0e9e6308ca08110c88a9619a9e5a19aa9 (diff) | |
parent | 247861c325c2e4f5ad3c2f9a77ab9d85d15cbcfc (diff) |
Merge tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse
Pull fuse updates from Miklos Szeredi:
- Fix an issue with reusing the bdi in case of block based filesystems
- Allow root (in init namespace) to access fuse filesystems in user
namespaces if expicitly enabled with a module param
- Misc fixes
* tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
fuse: retire block-device-based superblock on force unmount
vfs: function to prevent re-use of block-device-based superblocks
virtio_fs: Modify format for virtio_fs_direct_access
virtiofs: delete unused parameter for virtio_fs_cleanup_vqs
fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other
fuse: Remove the control interface for virtio-fs
fuse: ioctl: translate ENOSYS
fuse: limit nsec
fuse: avoid unnecessary spinlock bump
fuse: fix deadlock between atomic O_TRUNC and page invalidation
fuse: write inode in fuse_release()
-rw-r--r-- | Documentation/filesystems/fuse.rst | 29 | ||||
-rw-r--r-- | fs/fuse/control.c | 4 | ||||
-rw-r--r-- | fs/fuse/dax.c | 2 | ||||
-rw-r--r-- | fs/fuse/dir.c | 16 | ||||
-rw-r--r-- | fs/fuse/file.c | 39 | ||||
-rw-r--r-- | fs/fuse/inode.c | 16 | ||||
-rw-r--r-- | fs/fuse/ioctl.c | 15 | ||||
-rw-r--r-- | fs/fuse/virtio_fs.c | 9 | ||||
-rw-r--r-- | fs/super.c | 33 | ||||
-rw-r--r-- | include/linux/fs.h | 2 |
10 files changed, 132 insertions, 33 deletions
diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst index 8120c3c0cb4e..1e31e87aee68 100644 --- a/Documentation/filesystems/fuse.rst +++ b/Documentation/filesystems/fuse.rst @@ -279,7 +279,7 @@ How are requirements fulfilled? the filesystem or not. Note that the *ptrace* check is not strictly necessary to - prevent B/2/i, it is enough to check if mount owner has enough + prevent C/2/i, it is enough to check if mount owner has enough privilege to send signal to the process accessing the filesystem, since *SIGSTOP* can be used to get a similar effect. @@ -288,10 +288,29 @@ I think these limitations are unacceptable? If a sysadmin trusts the users enough, or can ensure through other measures, that system processes will never enter non-privileged -mounts, it can relax the last limitation with a 'user_allow_other' -config option. If this config option is set, the mounting user can -add the 'allow_other' mount option which disables the check for other -users' processes. +mounts, it can relax the last limitation in several ways: + + - With the 'user_allow_other' config option. If this config option is + set, the mounting user can add the 'allow_other' mount option which + disables the check for other users' processes. + + User namespaces have an unintuitive interaction with 'allow_other': + an unprivileged user - normally restricted from mounting with + 'allow_other' - could do so in a user namespace where they're + privileged. If any process could access such an 'allow_other' mount + this would give the mounting user the ability to manipulate + processes in user namespaces where they're unprivileged. For this + reason 'allow_other' restricts access to users in the same userns + or a descendant. + + - With the 'allow_sys_admin_access' module option. If this option is + set, super user's processes have unrestricted access to mounts + irrespective of allow_other setting or user namespace of the + mounting user. + +Note that both of these relaxations expose the system to potential +information leak or *DoS* as described in points B and C/2/i-ii in the +preceding section. Kernel - userspace interface ============================ diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7cede9a3bc96..247ef4f76761 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) struct dentry *parent; char name[32]; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; @@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 10eb50cbf398..e23e802a8013 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) WARN_ON(fcd->nr_free_ranges <= 0); fcd->nr_free_ranges--; } + __kick_dmap_free_worker(fcd, 0); spin_unlock(&fcd->lock); - kick_dmap_free_worker(fcd, 0); return dmap; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 74303d6e987b..b585b04e815e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -11,6 +11,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fs_context.h> +#include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -21,6 +22,11 @@ #include <linux/types.h> #include <linux/kernel.h> +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -537,6 +543,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_file *ff; void *security_ctx = NULL; u32 security_ctxlen; + bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -561,7 +568,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.mode = mode; inarg.umask = current_umask(); - if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) && + if (fm->fc->handle_killpriv_v2 && trunc && !(flags & O_EXCL) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } @@ -623,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, } else { file->private_data = ff; fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -1224,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + if (fc->allow_other) return current_in_userns(fc->user_ns); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 00fa861aeead..7154b9555f39 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - truncate_pagecache(inode, 0); file_update_time(file); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { - invalidate_inode_pages2(inode->i_mapping); } - if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (is_wb_truncate || dax_truncate) { + if (is_wb_truncate || dax_truncate) inode_lock(inode); - fuse_set_nowrite(inode); - } if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) - goto out; + goto out_inode_unlock; } + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); -out: + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } if (dax_truncate) filemap_invalidate_unlock(inode->i_mapping); - - if (is_wb_truncate | dax_truncate) { - fuse_release_nowrite(inode); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) inode_unlock(inode); - } return err; } @@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, false); /* return value is ignored by VFS */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8c0665c5dff8..6b3beda16c1b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ @@ -476,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - if (!fc->no_force_umount) - fuse_abort_conn(fc); + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); } static void fuse_send_destroy(struct fuse_mount *fm) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 33cde4bbccdc..61d8afcb10a3 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,6 +9,17 @@ #include <linux/compat.h> #include <linux/fileattr.h> +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +{ + ssize_t ret = fuse_simple_request(fm, args); + + /* Translate ENOSYS, which shouldn't be returned from fs */ + if (ret == -ENOSYS) + ret = -ENOTTY; + + return ret; +} + /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_simple_request(fm, &args); + err = fuse_send_ioctl(fm, &args); if (!err) { if (outarg.result < 0) err = outarg.result; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8db53fa67359..4d8d4f16c727 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -741,8 +741,7 @@ out: } /* Free virtqueues (device must already be reset) */ -static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, - struct virtio_fs *fs) +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) { vdev->config->del_vqs(vdev); } @@ -757,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); - size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; if (kaddr) *kaddr = fs->window_kaddr + offset; @@ -895,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); kfree(fs->vqs); out: @@ -927,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); vdev->priv = NULL; /* Put device reference on virtio_fs object */ diff --git a/fs/super.c b/fs/super.c index 4fca6657f442..734ed584a946 100644 --- a/fs/super.c +++ b/fs/super.c @@ -423,6 +423,35 @@ bool trylock_super(struct super_block *sb) } /** + * retire_super - prevents superblock from being reused + * @sb: superblock to retire + * + * The function marks superblock to be ignored in superblock test, which + * prevents it from being reused for any new mounts. If the superblock has + * a private bdi, it also unregisters it, but doesn't reduce the refcount + * of the superblock to prevent potential races. The refcount is reduced + * by generic_shutdown_super(). The function can not be called + * concurrently with generic_shutdown_super(). It is safe to call the + * function multiple times, subsequent calls have no effect. + * + * The marker will affect the re-use only for block-device-based + * superblocks. Other superblocks will still get marked if this function + * is used, but that will not affect their reusability. + */ +void retire_super(struct super_block *sb) +{ + WARN_ON(!sb->s_bdev); + down_write(&sb->s_umount); + if (sb->s_iflags & SB_I_PERSB_BDI) { + bdi_unregister(sb->s_bdi); + sb->s_iflags &= ~SB_I_PERSB_BDI; + } + sb->s_iflags |= SB_I_RETIRED; + up_write(&sb->s_umount); +} +EXPORT_SYMBOL(retire_super); + +/** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * @@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { - return s->s_bdev == fc->sget_key; + return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; } /** @@ -1309,7 +1338,7 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; } struct dentry *mount_bdev(struct file_system_type *fs_type, diff --git a/include/linux/fs.h b/include/linux/fs.h index a3522bd811f9..8c127ffa6563 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1433,6 +1433,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ /* Possible states of 'frozen' field */ enum { @@ -2565,6 +2566,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); +void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); |