From a6435940b62f81a1718bf2bd46a051379fc89b9d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:20 +0100 Subject: mount: attach mappings to mounts In order to support per-mount idmappings vfsmounts are marked with user namespaces. The idmapping of the user namespace will be used to map the ids of vfs objects when they are accessed through that mount. By default all vfsmounts are marked with the initial user namespace. The initial user namespace is used to indicate that a mount is not idmapped. All operations behave as before. Based on prior discussions we want to attach the whole user namespace and not just a dedicated idmapping struct. This allows us to reuse all the helpers that already exist for dealing with idmappings instead of introducing a whole new range of helpers. In addition, if we decide in the future that we are confident enough to enable unprivileged users to setup idmapped mounts the permission checking can take into account whether the caller is privileged in the user namespace the mount is currently marked with. Later patches enforce that once a mount has been idmapped it can't be remapped. This keeps permission checking and life-cycle management simple. Users wanting to change the idmapped can always create a new detached mount with a different idmapping. Add a new mnt_userns member to vfsmount and two simple helpers to retrieve the mnt_userns from vfsmounts and files. The idea to attach user namespaces to vfsmounts has been floated around in various forms at Linux Plumbers in ~2018 with the original idea tracing back to a discussion in 2017 at a conference in St. Petersburg between Christoph, Tycho, and myself. Link: https://lore.kernel.org/r/20210121131959.646623-2-christian.brauner@ubuntu.com Cc: Christoph Hellwig Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 9d33909d0f9e..ecdc63ef881c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -210,6 +210,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + mnt->mnt.mnt_userns = &init_user_ns; } return mnt; @@ -547,6 +548,11 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(&mnt->mnt); + if (mnt_userns != &init_user_ns) + put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1055,6 +1061,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); + mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); + if (mnt->mnt.mnt_userns != &init_user_ns) + mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; -- cgit v1.2.3-70-g09d2 From 68847c941700475575ced191108971d26e82ae29 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:48 +0100 Subject: namespace: take lock_mount_hash() directly when changing flags Changing mount options always ends up taking lock_mount_hash() but when MNT_READONLY is requested and neither the mount nor the superblock are MNT_READONLY we end up taking the lock, dropping it, and retaking it to change the other mount attributes. Instead, let's acquire the lock once when changing the mount attributes. This simplifies the locking in these codepath, makes them easier to reason about and avoids having to reacquire the lock right after dropping it. Link: https://lore.kernel.org/r/20210121131959.646623-30-christian.brauner@ubuntu.com Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index ecdc63ef881c..bdfb130f2c3c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -464,7 +464,6 @@ static int mnt_make_readonly(struct mount *mnt) { int ret = 0; - lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -498,18 +497,9 @@ static int mnt_make_readonly(struct mount *mnt) */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - unlock_mount_hash(); return ret; } -static int __mnt_unmake_readonly(struct mount *mnt) -{ - lock_mount_hash(); - mnt->mnt.mnt_flags &= ~MNT_READONLY; - unlock_mount_hash(); - return 0; -} - int sb_prepare_remount_readonly(struct super_block *sb) { struct mount *mnt; @@ -2523,7 +2513,8 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) if (readonly_request) return mnt_make_readonly(mnt); - return __mnt_unmake_readonly(mnt); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + return 0; } /* @@ -2532,11 +2523,9 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { - lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); - unlock_mount_hash(); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) @@ -2582,9 +2571,11 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) return -EPERM; down_write(&sb->s_umount); + lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); + unlock_mount_hash(); up_write(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); @@ -2625,8 +2616,11 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); - if (!err) + if (!err) { + lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); + unlock_mount_hash(); + } } up_write(&sb->s_umount); } -- cgit v1.2.3-70-g09d2 From d033cb6784c4f3a19a593cfe11f850e476197388 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:49 +0100 Subject: mount: make {lock,unlock}_mount_hash() static The lock_mount_hash() and unlock_mount_hash() helpers are never called outside a single file. Remove them from the header and make them static to reflect this fact. There's no need to have them callable from other places right now, as Christoph observed. Link: https://lore.kernel.org/r/20210121131959.646623-31-christian.brauner@ubuntu.com Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/mount.h | 10 ---------- fs/namespace.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/mount.h b/fs/mount.h index ce6c376e0bc2..0b6e08cf8afb 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -124,16 +124,6 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; -static inline void lock_mount_hash(void) -{ - write_seqlock(&mount_lock); -} - -static inline void unlock_mount_hash(void) -{ - write_sequnlock(&mount_lock); -} - struct proc_mounts { struct mnt_namespace *ns; struct path root; diff --git a/fs/namespace.c b/fs/namespace.c index bdfb130f2c3c..5fceb2854395 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -87,6 +87,16 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); -- cgit v1.2.3-70-g09d2 From e58ace1a0fa9d578f85f556b4b88c5fe9b871d08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:50 +0100 Subject: namespace: only take read lock in do_reconfigure_mnt() do_reconfigure_mnt() used to take the down_write(&sb->s_umount) lock which seems unnecessary since we're not changing the superblock. We're only checking whether it is already read-only. Setting other mount attributes is protected by lock_mount_hash() afaict and not by s_umount. The history of down_write(&sb->s_umount) lock being taken when setting mount attributes dates back to the introduction of MNT_READONLY in [2]. This introduced the concept of having read-only mounts in contrast to just having a read-only superblock. When it got introduced it was simply plumbed into do_remount() which already took down_write(&sb->s_umount) because it was only used to actually change the superblock before [2]. Afaict, it would've already been possible back then to only use down_read(&sb->s_umount) for MS_BIND | MS_REMOUNT since actual mount options were protected by the vfsmount lock already. But that would've meant special casing the locking for MS_BIND | MS_REMOUNT in do_remount() which people might not have considered worth it. Then in [1] MS_BIND | MS_REMOUNT mount option changes were split out of do_remount() into do_reconfigure_mnt() but the down_write(&sb->s_umount) lock was simply copied over. Now that we have this be a separate helper only take the down_read(&sb->s_umount) lock since we're only interested in checking whether the super block is currently read-only and blocking any writers from changing it. Essentially, checking that the super block is read-only has the advantage that we can avoid having to go into the slowpath and through MNT_WRITE_HOLD and can simply set the read-only flag on the mount in set_mount_attributes(). [1]: commit 43f5e655eff7 ("vfs: Separate changing mount flags full remount") [2]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount") Link: https://lore.kernel.org/r/20210121131959.646623-32-christian.brauner@ubuntu.com Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 5fceb2854395..367f1c7cb6db 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2527,10 +2527,6 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) return 0; } -/* - * Update the user-settable attributes on a mount. The caller must hold - * sb->s_umount for writing. - */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; @@ -2580,13 +2576,17 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - down_write(&sb->s_umount); + /* + * We're only checking whether the superblock is read-only not + * changing it, so only take down_read(&sb->s_umount). + */ + down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); - up_write(&sb->s_umount); + up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); -- cgit v1.2.3-70-g09d2 From fbdc2f6c40f6528fa0db79c73e844451234f3e26 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:51 +0100 Subject: fs: split out functions to hold writers When a mount is marked read-only we set MNT_WRITE_HOLD on it if there aren't currently any active writers. Split this logic out into simple helpers that we can use in follow-up patches. Link: https://lore.kernel.org/r/20210121131959.646623-33-christian.brauner@ubuntu.com Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 367f1c7cb6db..774ae5f74716 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -470,10 +470,8 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); -static int mnt_make_readonly(struct mount *mnt) +static inline int mnt_hold_writers(struct mount *mnt) { - int ret = 0; - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -498,15 +496,29 @@ static int mnt_make_readonly(struct mount *mnt) * we're counting up here. */ if (mnt_get_writers(mnt) > 0) - ret = -EBUSY; - else - mnt->mnt.mnt_flags |= MNT_READONLY; + return -EBUSY; + + return 0; +} + +static inline void mnt_unhold_writers(struct mount *mnt) +{ /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; +} + +static int mnt_make_readonly(struct mount *mnt) +{ + int ret; + + ret = mnt_hold_writers(mnt); + if (!ret) + mnt->mnt.mnt_flags |= MNT_READONLY; + mnt_unhold_writers(mnt); return ret; } -- cgit v1.2.3-70-g09d2 From 5b490500f91b212d862560e7568aa5cdc141f9d0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:52 +0100 Subject: fs: add attr_flags_to_mnt_flags helper Add a simple helper to translate uapi MOUNT_ATTR_* flags to MNT_* flags which we will use in follow-up patches too. Link: https://lore.kernel.org/r/20210121131959.646623-34-christian.brauner@ubuntu.com Cc: David Howells Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 774ae5f74716..00ed0d6cb2ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3465,6 +3465,28 @@ out_type: return ret; } +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) + +static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) +{ + unsigned int mnt_flags = 0; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + return mnt_flags; +} + /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. @@ -3487,24 +3509,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; - if (attr_flags & ~(MOUNT_ATTR_RDONLY | - MOUNT_ATTR_NOSUID | - MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | - MOUNT_ATTR__ATIME | - MOUNT_ATTR_NODIRATIME)) + if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; - if (attr_flags & MOUNT_ATTR_RDONLY) - mnt_flags |= MNT_READONLY; - if (attr_flags & MOUNT_ATTR_NOSUID) - mnt_flags |= MNT_NOSUID; - if (attr_flags & MOUNT_ATTR_NODEV) - mnt_flags |= MNT_NODEV; - if (attr_flags & MOUNT_ATTR_NOEXEC) - mnt_flags |= MNT_NOEXEC; - if (attr_flags & MOUNT_ATTR_NODIRATIME) - mnt_flags |= MNT_NODIRATIME; + mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: -- cgit v1.2.3-70-g09d2 From 2a1867219c7b27f928e2545782b86daaf9ad50bd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:53 +0100 Subject: fs: add mount_setattr() This implements the missing mount_setattr() syscall. While the new mount api allows to change the properties of a superblock there is currently no way to change the properties of a mount or a mount tree using file descriptors which the new mount api is based on. In addition the old mount api has the restriction that mount options cannot be applied recursively. This hasn't changed since changing mount options on a per-mount basis was implemented in [1] and has been a frequent request not just for convenience but also for security reasons. The legacy mount syscall is unable to accommodate this behavior without introducing a whole new set of flags because MS_REC | MS_REMOUNT | MS_BIND | MS_RDONLY | MS_NOEXEC | [...] only apply the mount option to the topmost mount. Changing MS_REC to apply to the whole mount tree would mean introducing a significant uapi change and would likely cause significant regressions. The new mount_setattr() syscall allows to recursively clear and set mount options in one shot. Multiple calls to change mount options requesting the same changes are idempotent: int mount_setattr(int dfd, const char *path, unsigned flags, struct mount_attr *uattr, size_t usize); Flags to modify path resolution behavior are specified in the @flags argument. Currently, AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW, and AT_NO_AUTOMOUNT are supported. If useful, additional lookup flags to restrict path resolution as introduced with openat2() might be supported in the future. The mount_setattr() syscall can be expected to grow over time and is designed with extensibility in mind. It follows the extensible syscall pattern we have used with other syscalls such as openat2(), clone3(), sched_{set,get}attr(), and others. The set of mount options is passed in the uapi struct mount_attr which currently has the following layout: struct mount_attr { __u64 attr_set; __u64 attr_clr; __u64 propagation; __u64 userns_fd; }; The @attr_set and @attr_clr members are used to clear and set mount options. This way a user can e.g. request that a set of flags is to be raised such as turning mounts readonly by raising MOUNT_ATTR_RDONLY in @attr_set while at the same time requesting that another set of flags is to be lowered such as removing noexec from a mount tree by specifying MOUNT_ATTR_NOEXEC in @attr_clr. Note, since the MOUNT_ATTR_ values are an enum starting from 0, not a bitmap, users wanting to transition to a different atime setting cannot simply specify the atime setting in @attr_set, but must also specify MOUNT_ATTR__ATIME in the @attr_clr field. So we ensure that MOUNT_ATTR__ATIME can't be partially set in @attr_clr and that @attr_set can't have any atime bits set if MOUNT_ATTR__ATIME isn't set in @attr_clr. The @propagation field lets callers specify the propagation type of a mount tree. Propagation is a single property that has four different settings and as such is not really a flag argument but an enum. Specifically, it would be unclear what setting and clearing propagation settings in combination would amount to. The legacy mount() syscall thus forbids the combination of multiple propagation settings too. The goal is to keep the semantics of mount propagation somewhat simple as they are overly complex as it is. The @userns_fd field lets user specify a user namespace whose idmapping becomes the idmapping of the mount. This is implemented and explained in detail in the next patch. [1]: commit 2e4b7fcd9260 ("[PATCH] r/o bind mounts: honor mount writer counts at remount") Link: https://lore.kernel.org/r/20210121131959.646623-35-christian.brauner@ubuntu.com Cc: David Howells Cc: Aleksa Sarai Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-api@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + fs/namespace.c | 263 ++++++++++++++++++++++++++++ include/linux/syscalls.h | 4 + include/uapi/asm-generic/unistd.h | 4 +- include/uapi/linux/mount.h | 15 ++ tools/include/uapi/asm-generic/unistd.h | 4 +- 23 files changed, 307 insertions(+), 3 deletions(-) (limited to 'fs/namespace.c') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a6617067dbe6..02f0244e005c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -481,3 +481,4 @@ 549 common faccessat2 sys_faccessat2 550 common process_madvise sys_process_madvise 551 common epoll_pwait2 sys_epoll_pwait2 +552 common mount_setattr sys_mount_setattr diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 20e1170e2e0a..dcc1191291a2 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -455,3 +455,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 86a9d7b3eabe..949788f5ba40 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 442 +#define __NR_compat_syscalls 443 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index cccfbbefbf95..3d874f624056 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -891,6 +891,8 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_process_madvise, sys_process_madvise) #define __NR_epoll_pwait2 441 __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2) +#define __NR_mount_setattr 442 +__SYSCALL(__NR_mount_setattr, sys_mount_setattr) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index bfc00f2bd437..d89231166e19 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -362,3 +362,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 7fe4e45c864c..72bde6707dd3 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -441,3 +441,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a522adf194ab..d603a5ec9338 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -447,3 +447,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 0f03ad223f33..8fd8c1790941 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -380,3 +380,4 @@ 439 n32 faccessat2 sys_faccessat2 440 n32 process_madvise sys_process_madvise 441 n32 epoll_pwait2 compat_sys_epoll_pwait2 +442 n32 mount_setattr sys_mount_setattr diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 91649690b52f..169f21438065 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -356,3 +356,4 @@ 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 +442 n64 mount_setattr sys_mount_setattr diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 4bad0c40aed6..090d29ca80ff 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -429,3 +429,4 @@ 439 o32 faccessat2 sys_faccessat2 440 o32 process_madvise sys_process_madvise 441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 o32 mount_setattr sys_mount_setattr diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 6bcc31966b44..271a92519683 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f744eb5cba88..72e5aa67ab8a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -531,3 +531,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index d443423495e5..3abef2144dac 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -444,3 +444,4 @@ 439 common faccessat2 sys_faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr sys_mount_setattr diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 9df40ac0ebc0..d08eebad6b7f 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -444,3 +444,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 40d8c7cd8298..84403a99039c 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -487,3 +487,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 874aeacde2dd..a1c9f496fca6 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -446,3 +446,4 @@ 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 +442 i386 mount_setattr sys_mount_setattr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78672124d28b..7bf01cbe582f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -363,6 +363,7 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 46116a28eeed..365a9b849224 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -412,3 +412,4 @@ 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 +442 common mount_setattr sys_mount_setattr diff --git a/fs/namespace.c b/fs/namespace.c index 00ed0d6cb2ee..726a51c8c46e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -73,6 +73,14 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mount_kattr { + unsigned int attr_set; + unsigned int attr_clr; + unsigned int propagation; + unsigned int lookup_flags; + bool recurse; +}; + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -3469,6 +3477,11 @@ out_type: (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) +#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS + +#define MOUNT_SETATTR_PROPAGATION_FLAGS \ + (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) + static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; @@ -3820,6 +3833,256 @@ out0: return error; } +static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) +{ + unsigned int flags = mnt->mnt.mnt_flags; + + /* flags to clear */ + flags &= ~kattr->attr_clr; + /* flags to raise */ + flags |= kattr->attr_set; + + return flags; +} + +static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, + struct mount *mnt, int *err) +{ + struct mount *m = mnt, *last = NULL; + + if (!is_mounted(&m->mnt)) { + *err = -EINVAL; + goto out; + } + + if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) { + *err = -EINVAL; + goto out; + } + + do { + unsigned int flags; + + flags = recalc_flags(kattr, m); + if (!can_change_locked_flags(m, flags)) { + *err = -EPERM; + goto out; + } + + last = m; + + if ((kattr->attr_set & MNT_READONLY) && + !(m->mnt.mnt_flags & MNT_READONLY)) { + *err = mnt_hold_writers(m); + if (*err) + goto out; + } + } while (kattr->recurse && (m = next_mnt(m, mnt))); + +out: + return last; +} + +static void mount_setattr_commit(struct mount_kattr *kattr, + struct mount *mnt, struct mount *last, + int err) +{ + struct mount *m = mnt; + + do { + if (!err) { + unsigned int flags; + + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); + } + + /* + * We either set MNT_READONLY above so make it visible + * before ~MNT_WRITE_HOLD or we failed to recursively + * apply mount options. + */ + if ((kattr->attr_set & MNT_READONLY) && + (m->mnt.mnt_flags & MNT_WRITE_HOLD)) + mnt_unhold_writers(m); + + if (!err && kattr->propagation) + change_mnt_propagation(m, kattr->propagation); + + /* + * On failure, only cleanup until we found the first mount + * we failed to handle. + */ + if (err && m == last) + break; + } while (kattr->recurse && (m = next_mnt(m, mnt))); + + if (!err) + touch_mnt_namespace(mnt->mnt_ns); +} + +static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +{ + struct mount *mnt = real_mount(path->mnt), *last = NULL; + int err = 0; + + if (path->dentry != mnt->mnt.mnt_root) + return -EINVAL; + + if (kattr->propagation) { + /* + * Only take namespace_lock() if we're actually changing + * propagation. + */ + namespace_lock(); + if (kattr->propagation == MS_SHARED) { + err = invent_group_ids(mnt, kattr->recurse); + if (err) { + namespace_unlock(); + return err; + } + } + } + + lock_mount_hash(); + + /* + * Get the mount tree in a shape where we can change mount + * properties without failure. + */ + last = mount_setattr_prepare(kattr, mnt, &err); + if (last) /* Commit all changes or revert to the old state. */ + mount_setattr_commit(kattr, mnt, last, err); + + unlock_mount_hash(); + + if (kattr->propagation) { + namespace_unlock(); + if (err) + cleanup_group_ids(mnt, NULL); + } + + return err; +} + +static int build_mount_kattr(const struct mount_attr *attr, + struct mount_kattr *kattr, unsigned int flags) +{ + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + *kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + .recurse = !!(flags & AT_RECURSIVE), + }; + + if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) + return -EINVAL; + if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) + return -EINVAL; + kattr->propagation = attr->propagation; + + if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) + return -EINVAL; + + if (attr->userns_fd) + return -EINVAL; + + kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); + kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); + + /* + * Since the MOUNT_ATTR_ values are an enum, not a bitmap, + * users wanting to transition to a different atime setting cannot + * simply specify the atime setting in @attr_set, but must also + * specify MOUNT_ATTR__ATIME in the @attr_clr field. + * So ensure that MOUNT_ATTR__ATIME can't be partially set in + * @attr_clr and that @attr_set can't have any atime bits set if + * MOUNT_ATTR__ATIME isn't set in @attr_clr. + */ + if (attr->attr_clr & MOUNT_ATTR__ATIME) { + if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) + return -EINVAL; + + /* + * Clear all previous time settings as they are mutually + * exclusive. + */ + kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; + switch (attr->attr_set & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_RELATIME: + kattr->attr_set |= MNT_RELATIME; + break; + case MOUNT_ATTR_NOATIME: + kattr->attr_set |= MNT_NOATIME; + break; + case MOUNT_ATTR_STRICTATIME: + break; + default: + return -EINVAL; + } + } else { + if (attr->attr_set & MOUNT_ATTR__ATIME) + return -EINVAL; + } + + return 0; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_attr attr; + struct mount_kattr kattr; + + BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (err) + return err; + + /* Don't bother walking through the mounts if this is a nop. */ + if (attr.attr_set == 0 && + attr.attr_clr == 0 && + attr.propagation == 0) + return 0; + + err = build_mount_kattr(&attr, &kattr, flags); + if (err) + return err; + + err = user_path_at(dfd, path, kattr.lookup_flags, &target); + if (err) + return err; + + err = do_mount_setattr(&target, &kattr); + path_put(&target); + return err; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7688bc983de5..cd7b5c817ba2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -68,6 +68,7 @@ union bpf_attr; struct io_uring_params; struct clone_args; struct open_how; +struct mount_attr; #include #include @@ -1028,6 +1029,9 @@ asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, unsigned int ms_flags); +asmlinkage long sys_mount_setattr(int dfd, const char __user *path, + unsigned int flags, + struct mount_attr __user *uattr, size_t usize); asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags); asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key, const void __user *value, int aux); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 728752917785..ce58cff99b66 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_process_madvise, sys_process_madvise) #define __NR_epoll_pwait2 441 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) +#define __NR_mount_setattr 442 +__SYSCALL(__NR_mount_setattr, sys_mount_setattr) #undef __NR_syscalls -#define __NR_syscalls 442 +#define __NR_syscalls 443 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index dd8306ea336c..d3bcefdfa73d 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_MOUNT_H #define _UAPI_LINUX_MOUNT_H +#include + /* * These are the fs-independent mount-flags: up to 32 flags are supported * @@ -118,4 +120,17 @@ enum fsconfig_command { #define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ #define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +/* + * mount_setattr() + */ +struct mount_attr { + __u64 attr_set; + __u64 attr_clr; + __u64 propagation; + __u64 userns_fd; +}; + +/* List of all mount_attr versions. */ +#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 728752917785..ce58cff99b66 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -861,9 +861,11 @@ __SYSCALL(__NR_faccessat2, sys_faccessat2) __SYSCALL(__NR_process_madvise, sys_process_madvise) #define __NR_epoll_pwait2 441 __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) +#define __NR_mount_setattr 442 +__SYSCALL(__NR_mount_setattr, sys_mount_setattr) #undef __NR_syscalls -#define __NR_syscalls 442 +#define __NR_syscalls 443 /* * 32 bit systems traditionally used different -- cgit v1.2.3-70-g09d2 From 9caccd41541a6f7d6279928d9f971f6642c361af Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Jan 2021 14:19:54 +0100 Subject: fs: introduce MOUNT_ATTR_IDMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new mount bind mount property to allow idmapping mounts. The MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall together with a file descriptor referring to a user namespace. The user namespace referenced by the namespace file descriptor will be attached to the bind mount. All interactions with the filesystem going through that mount will be mapped according to the mapping specified in the user namespace attached to it. Using user namespaces to mark mounts means we can reuse all the existing infrastructure in the kernel that already exists to handle idmappings and can also use this for permission checking to allow unprivileged user to create idmapped mounts in the future. Idmapping a mount is decoupled from the caller's user and mount namespace. This means idmapped mounts can be created in the initial user namespace which is an important use-case for systemd-homed, portable usb-sticks between systems, sharing data between the initial user namespace and unprivileged containers, and other use-cases that have been brought up. For example, assume a home directory where all files are owned by uid and gid 1000 and the home directory is brought to a new laptop where the user has id 12345. The system administrator can simply create a mount of this home directory with a mapping of 1000:12345:1 and other mappings to indicate the ids should be kept. (With this it is e.g. also possible to create idmapped mounts on the host with an identity mapping 1:1:100000 where the root user is not mapped. A user with root access that e.g. has been pivot rooted into such a mount on the host will be not be able to execute, read, write, or create files as root.) Given that mapping a mount is decoupled from the caller's user namespace a sufficiently privileged process such as a container manager can set up an idmapped mount for the container and the container can simply pivot root to it. There's no need for the container to do anything. The mount will appear correctly mapped independent of the user namespace the container uses. This means we don't need to mark a mount as idmappable. In order to create an idmapped mount the caller must currently be privileged in the user namespace of the superblock the mount belongs to. Once a mount has been idmapped we don't allow it to change its mapping. This keeps permission checking and life-cycle management simple. Users wanting to change the idmapped can always create a new detached mount with a different idmapping. Link: https://lore.kernel.org/r/20210121131959.646623-36-christian.brauner@ubuntu.com Cc: Christoph Hellwig Cc: David Howells Cc: Mauricio Vásquez Bernal Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/namespace.c | 122 ++++++++++++++++++++++++++++++++++++++++++--- fs/proc_namespace.c | 3 ++ include/linux/mount.h | 3 +- include/uapi/linux/mount.h | 1 + 4 files changed, 121 insertions(+), 8 deletions(-) (limited to 'fs/namespace.c') diff --git a/fs/namespace.c b/fs/namespace.c index 726a51c8c46e..062fe984a697 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ struct mount_kattr { unsigned int propagation; unsigned int lookup_flags; bool recurse; + struct user_namespace *mnt_userns; }; /* /sys/fs */ @@ -3477,7 +3479,7 @@ out_type: (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) -#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS +#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) @@ -3845,6 +3847,36 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) return flags; } +static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct vfsmount *m = &mnt->mnt; + + if (!kattr->mnt_userns) + return 0; + + /* + * Once a mount has been idmapped we don't allow it to change its + * mapping. It makes things simpler and callers can just create + * another bind-mount they can idmap if they want to. + */ + if (mnt_user_ns(m) != &init_user_ns) + return -EPERM; + + /* The underlying filesystem doesn't support idmapped mounts yet. */ + if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) + return -EINVAL; + + /* We're not controlling the superblock. */ + if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Mount has already been visible in the filesystem hierarchy. */ + if (!is_anon_ns(mnt->mnt_ns)) + return -EINVAL; + + return 0; +} + static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt, int *err) { @@ -3869,6 +3901,10 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, goto out; } + *err = can_idmap_mount(kattr, m); + if (*err) + goto out; + last = m; if ((kattr->attr_set & MNT_READONLY) && @@ -3883,6 +3919,18 @@ out: return last; } +static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct user_namespace *mnt_userns; + + if (!kattr->mnt_userns) + return; + + mnt_userns = get_user_ns(kattr->mnt_userns); + /* Pairs with smp_load_acquire() in mnt_user_ns(). */ + smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); +} + static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt, struct mount *last, int err) @@ -3893,6 +3941,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, if (!err) { unsigned int flags; + do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); } @@ -3965,7 +4014,62 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) return err; } -static int build_mount_kattr(const struct mount_attr *attr, +static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + int err = 0; + struct ns_common *ns; + struct user_namespace *mnt_userns; + struct file *file; + + if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) + return 0; + + /* + * We currently do not support clearing an idmapped mount. If this ever + * is a use-case we can revisit this but for now let's keep it simple + * and not allow it. + */ + if (attr->attr_clr & MOUNT_ATTR_IDMAP) + return -EINVAL; + + if (attr->userns_fd > INT_MAX) + return -EINVAL; + + file = fget(attr->userns_fd); + if (!file) + return -EBADF; + + if (!proc_ns_file(file)) { + err = -EINVAL; + goto out_fput; + } + + ns = get_proc_ns(file_inode(file)); + if (ns->ops->type != CLONE_NEWUSER) { + err = -EINVAL; + goto out_fput; + } + + /* + * The init_user_ns is used to indicate that a vfsmount is not idmapped. + * This is simpler than just having to treat NULL as unmapped. Users + * wanting to idmap a mount to init_user_ns can just use a namespace + * with an identity mapping. + */ + mnt_userns = container_of(ns, struct user_namespace, ns); + if (mnt_userns == &init_user_ns) { + err = -EPERM; + goto out_fput; + } + kattr->mnt_userns = get_user_ns(mnt_userns); + +out_fput: + fput(file); + return err; +} + +static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr, unsigned int flags) { unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; @@ -3991,9 +4095,6 @@ static int build_mount_kattr(const struct mount_attr *attr, if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; - if (attr->userns_fd) - return -EINVAL; - kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); @@ -4032,7 +4133,13 @@ static int build_mount_kattr(const struct mount_attr *attr, return -EINVAL; } - return 0; + return build_mount_idmapped(attr, usize, kattr, flags); +} + +static void finish_mount_kattr(struct mount_kattr *kattr) +{ + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, @@ -4070,7 +4177,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, attr.propagation == 0) return 0; - err = build_mount_kattr(&attr, &kattr, flags); + err = build_mount_kattr(&attr, usize, &kattr, flags); if (err) return err; @@ -4079,6 +4186,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; err = do_mount_setattr(&target, &kattr); + finish_mount_kattr(&kattr); path_put(&target); return err; } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index eafb75755fa3..392ef5162655 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + if (mnt_user_ns(mnt) != &init_user_ns) + seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) diff --git a/include/linux/mount.h b/include/linux/mount.h index 52de25e08319..161f4419db6c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -77,7 +77,8 @@ struct vfsmount { static inline struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) { - return mnt->mnt_userns; + /* Pairs with smp_store_release() in do_idmap_mount(). */ + return smp_load_acquire(&mnt->mnt_userns); } struct file; /* forward dec */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index d3bcefdfa73d..e6524ead2b7b 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -119,6 +119,7 @@ enum fsconfig_command { #define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */ #define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */ #define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */ +#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */ /* * mount_setattr() -- cgit v1.2.3-70-g09d2