diff options
31 files changed, 1298 insertions, 129 deletions
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 18c842ca6c32..186e785f5b56 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -496,3 +496,5 @@ 564 common futex_wake sys_futex_wake 565 common futex_wait sys_futex_wait 566 common futex_requeue sys_futex_requeue +567 common statmount sys_statmount +568 common listmount sys_listmount diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 584f9528c996..d6a324dbff2e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -470,3 +470,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 9f7c1bf99526..8a191423c316 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -919,6 +919,10 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_statmount 457 +__SYSCALL(__NR_statmount, sys_statmount) +#define __NR_listmount 458 +__SYSCALL(__NR_listmount, sys_listmount) /* * Please add new compat syscalls above this comment and update diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 7a4b780e82cb..37db1a810b67 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -456,3 +456,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 5b6a0b02b7de..07fff5ad1c9c 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -462,3 +462,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index a842b41c8e06..134ea054b1c7 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -395,3 +395,5 @@ 454 n32 futex_wake sys_futex_wake 455 n32 futex_wait sys_futex_wait 456 n32 futex_requeue sys_futex_requeue +457 n32 statmount sys_statmount +458 n32 listmount sys_listmount diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 116ff501bf92..959a21664703 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -371,3 +371,5 @@ 454 n64 futex_wake sys_futex_wake 455 n64 futex_wait sys_futex_wait 456 n64 futex_requeue sys_futex_requeue +457 n64 statmount sys_statmount +458 n64 listmount sys_listmount diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 525cc54bc63b..e55bc1d4bf0f 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -444,3 +444,5 @@ 454 o32 futex_wake sys_futex_wake 455 o32 futex_wait sys_futex_wait 456 o32 futex_requeue sys_futex_requeue +457 o32 statmount sys_statmount +458 o32 listmount sys_listmount diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index a47798fed54e..9c84470c31c7 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -455,3 +455,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 7fab411378f2..6988ecbc316e 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -543,3 +543,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 86fec9b080f6..5f5cd20ebb34 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -459,3 +459,5 @@ 454 common futex_wake sys_futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue sys_futex_requeue +457 common statmount sys_statmount sys_statmount +458 common listmount sys_listmount sys_listmount diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 363fae0fe9bf..3103ebd2e4cb 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -459,3 +459,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7bcaa3d5ea44..ba147d7ad19a 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -502,3 +502,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c8fac5205803..56e6c2f3ee9c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -461,3 +461,5 @@ 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait 456 i386 futex_requeue sys_futex_requeue +457 i386 statmount sys_statmount +458 i386 listmount sys_listmount diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 8cb8bf68721c..3a22eef585c2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -378,6 +378,8 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 06eefa9c1458..497b5d32f457 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -427,3 +427,5 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common statmount sys_statmount +458 common listmount sys_listmount diff --git a/fs/internal.h b/fs/internal.h index a7469ddba9b6..3a970a0644ce 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -83,6 +83,8 @@ int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); +int show_path(struct seq_file *m, struct dentry *root); + /* * fs_struct.c */ diff --git a/fs/mount.h b/fs/mount.h index 130c07c2f8d2..4a42fc68f4cc 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -8,19 +8,13 @@ struct mnt_namespace { struct ns_common ns; struct mount * root; - /* - * Traversal and modification of .list is protected by either - * - taking namespace_sem for write, OR - * - taking namespace_sem for read AND taking .ns_lock. - */ - struct list_head list; - spinlock_t ns_lock; + struct rb_root mounts; /* Protected by namespace_sem */ struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; - unsigned int mounts; /* # of mounts in the namespace */ + unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; } __randomize_layout; @@ -55,7 +49,10 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - struct list_head mnt_list; + union { + struct rb_node mnt_node; /* Under ns->mounts */ + struct list_head mnt_list; + }; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -72,7 +69,8 @@ struct mount { struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif - int mnt_id; /* mount identifier */ + int mnt_id; /* mount identifier, reused */ + u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; @@ -127,7 +125,6 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -146,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) +{ + WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); + mnt->mnt.mnt_flags &= ~MNT_ONRB; + rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + list_add_tail(&mnt->mnt_list, dt_list); +} + extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/namespace.c b/fs/namespace.c index 78366f114515..3beda4bb59af 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,6 +32,7 @@ #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> +#include <linux/nospec.h> #include "pnode.h" #include "internal.h" @@ -68,6 +69,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +/* Don't allow confusion with old 32bit mount ID */ +static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32); + static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; @@ -131,6 +135,7 @@ static int mnt_alloc_id(struct mount *mnt) if (res < 0) return res; mnt->mnt_id = res; + mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); return 0; } @@ -730,21 +735,6 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } -static inline void lock_ns_list(struct mnt_namespace *ns) -{ - spin_lock(&ns->ns_lock); -} - -static inline void unlock_ns_list(struct mnt_namespace *ns) -{ - spin_unlock(&ns->ns_lock); -} - -static inline bool mnt_is_cursor(struct mount *mnt) -{ - return mnt->mnt.mnt_flags & MNT_CURSOR; -} - /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -763,19 +753,15 @@ static inline bool mnt_is_cursor(struct mount *mnt) bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; - struct mount *mnt; + struct mount *mnt, *n; bool is_covered = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { - if (mnt_is_cursor(mnt)) - continue; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } - unlock_ns_list(ns); up_read(&namespace_sem); return is_covered; @@ -1022,6 +1008,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m mnt_add_count(old_parent, -1); } +static inline struct mount *node_to_mount(struct rb_node *node) +{ + return node ? rb_entry(node, struct mount, mnt_node) : NULL; +} + +static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) +{ + struct rb_node **link = &ns->mounts.rb_node; + struct rb_node *parent = NULL; + + WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + mnt->mnt_ns = ns; + while (*link) { + parent = *link; + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + rb_link_node(&mnt->mnt_node, parent, link); + rb_insert_color(&mnt->mnt_node, &ns->mounts); + mnt->mnt.mnt_flags |= MNT_ONRB; +} + /* * vfsmount lock must be held for write */ @@ -1035,12 +1045,13 @@ static void commit_tree(struct mount *mnt) BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); - list_for_each_entry(m, &head, mnt_list) - m->mnt_ns = n; + while (!list_empty(&head)) { + m = list_first_entry(&head, typeof(*m), mnt_list); + list_del(&m->mnt_list); - list_splice(&head, n->list.prev); - - n->mounts += n->pending_mounts; + mnt_add_to_ns(n, m); + } + n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); @@ -1188,7 +1199,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1413,65 +1424,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path) return &p->mnt; } -#ifdef CONFIG_PROC_FS -static struct mount *mnt_list_next(struct mnt_namespace *ns, - struct list_head *p) +/* + * Returns the mount which either has the specified mnt_id, or has the next + * smallest id afer the specified one. + */ +static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { - struct mount *mnt, *ret = NULL; + struct rb_node *node = ns->mounts.rb_node; + struct mount *ret = NULL; - lock_ns_list(ns); - list_for_each_continue(p, &ns->list) { - mnt = list_entry(p, typeof(*mnt), mnt_list); - if (!mnt_is_cursor(mnt)) { - ret = mnt; - break; + while (node) { + struct mount *m = node_to_mount(node); + + if (mnt_id <= m->mnt_id_unique) { + ret = node_to_mount(node); + if (mnt_id == m->mnt_id_unique) + break; + node = node->rb_left; + } else { + node = node->rb_right; } } - unlock_ns_list(ns); - return ret; } +#ifdef CONFIG_PROC_FS + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; - struct list_head *prev; down_read(&namespace_sem); - if (!*pos) { - prev = &p->ns->list; - } else { - prev = &p->cursor.mnt_list; - /* Read after we'd reached the end? */ - if (list_empty(prev)) - return NULL; - } - - return mnt_list_next(p->ns, prev); + return mnt_find_id_at(p->ns, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; + struct mount *next = NULL, *mnt = v; + struct rb_node *node = rb_next(&mnt->mnt_node); ++*pos; - return mnt_list_next(p->ns, &mnt->mnt_list); + if (node) { + next = node_to_mount(node); + *pos = next->mnt_id_unique; + } + return next; } static void m_stop(struct seq_file *m, void *v) { - struct proc_mounts *p = m->private; - struct mount *mnt = v; - - lock_ns_list(p->ns); - if (mnt) - list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); - else - list_del_init(&p->cursor.mnt_list); - unlock_ns_list(p->ns); up_read(&namespace_sem); } @@ -1489,14 +1492,6 @@ const struct seq_operations mounts_op = { .show = m_show, }; -void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) -{ - down_read(&namespace_sem); - lock_ns_list(ns); - list_del(&cursor->mnt_list); - unlock_ns_list(ns); - up_read(&namespace_sem); -} #endif /* CONFIG_PROC_FS */ /** @@ -1638,7 +1633,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - list_move(&p->mnt_list, &tmp_list); + if (p->mnt.mnt_flags & MNT_ONRB) + move_from_ns(p, &tmp_list); + else + list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ @@ -1658,7 +1656,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { - ns->mounts--; + ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; @@ -1784,14 +1782,16 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (!list_empty(&mnt->mnt_list)) + if (mnt->mnt.mnt_flags & MNT_ONRB || + !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } @@ -2209,9 +2209,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) unsigned int mounts = 0; struct mount *p; - if (ns->mounts >= max) + if (ns->nr_mounts >= max) return -ENOSPC; - max -= ns->mounts; + max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; @@ -2355,8 +2355,12 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { + LIST_HEAD(head); + /* move from anon - the caller will destroy */ - list_del_init(&source_mnt->mnt_ns->list); + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + move_from_ns(p, &head); + list_del_init(&head); } if (beneath) mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); @@ -2667,11 +2671,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive) lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { - p->mnt_ns = ns; - ns->mounts++; + mnt_add_to_ns(ns, p); + ns->nr_mounts++; } ns->root = mnt; - list_add_tail(&ns->list, &mnt->mnt_list); mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); @@ -3735,9 +3738,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); - INIT_LIST_HEAD(&new_ns->list); + new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); - spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3784,7 +3786,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, unlock_mount_hash(); } new_ns->root = new; - list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -3794,8 +3795,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, p = old; q = new; while (p) { - q->mnt_ns = new_ns; - new_ns->mounts++; + mnt_add_to_ns(new_ns, q); + new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -3837,10 +3838,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) mntput(m); return ERR_CAST(ns); } - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts++; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts++; + mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); @@ -4018,10 +4018,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, goto err_path; } mnt = real_mount(newmount.mnt); - mnt->mnt_ns = ns; ns->root = mnt; - ns->mounts = 1; - list_add(&mnt->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, mnt); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount @@ -4677,6 +4676,438 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +int show_path(struct seq_file *m, struct dentry *root) +{ + if (root->d_sb->s_op->show_path) + return root->d_sb->s_op->show_path(m, root); + + seq_dentry(m, root, " \t\n\\"); + return 0; +} + +static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) +{ + struct mount *mnt = mnt_find_id_at(ns, id); + + if (!mnt || mnt->mnt_id_unique != id) + return NULL; + + return &mnt->mnt; +} + +struct kstatmount { + struct statmount __user *buf; + size_t bufsize; + struct vfsmount *mnt; + u64 mask; + struct path root; + struct statmount sm; + struct seq_file seq; +}; + +static u64 mnt_to_attr_flags(struct vfsmount *mnt) +{ + unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); + u64 attr_flags = 0; + + if (mnt_flags & MNT_READONLY) + attr_flags |= MOUNT_ATTR_RDONLY; + if (mnt_flags & MNT_NOSUID) + attr_flags |= MOUNT_ATTR_NOSUID; + if (mnt_flags & MNT_NODEV) + attr_flags |= MOUNT_ATTR_NODEV; + if (mnt_flags & MNT_NOEXEC) + attr_flags |= MOUNT_ATTR_NOEXEC; + if (mnt_flags & MNT_NODIRATIME) + attr_flags |= MOUNT_ATTR_NODIRATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; + + if (mnt_flags & MNT_NOATIME) + attr_flags |= MOUNT_ATTR_NOATIME; + else if (mnt_flags & MNT_RELATIME) + attr_flags |= MOUNT_ATTR_RELATIME; + else + attr_flags |= MOUNT_ATTR_STRICTATIME; + + if (is_idmapped_mnt(mnt)) + attr_flags |= MOUNT_ATTR_IDMAP; + + return attr_flags; +} + +static u64 mnt_to_propagation_flags(struct mount *m) +{ + u64 propagation = 0; + + if (IS_MNT_SHARED(m)) + propagation |= MS_SHARED; + if (IS_MNT_SLAVE(m)) + propagation |= MS_SLAVE; + if (IS_MNT_UNBINDABLE(m)) + propagation |= MS_UNBINDABLE; + if (!propagation) + propagation |= MS_PRIVATE; + + return propagation; +} + +static void statmount_sb_basic(struct kstatmount *s) +{ + struct super_block *sb = s->mnt->mnt_sb; + + s->sm.mask |= STATMOUNT_SB_BASIC; + s->sm.sb_dev_major = MAJOR(sb->s_dev); + s->sm.sb_dev_minor = MINOR(sb->s_dev); + s->sm.sb_magic = sb->s_magic; + s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); +} + +static void statmount_mnt_basic(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + s->sm.mask |= STATMOUNT_MNT_BASIC; + s->sm.mnt_id = m->mnt_id_unique; + s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; + s->sm.mnt_id_old = m->mnt_id; + s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; + s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); + s->sm.mnt_propagation = mnt_to_propagation_flags(m); + s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; +} + +static void statmount_propagate_from(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + + s->sm.mask |= STATMOUNT_PROPAGATE_FROM; + if (IS_MNT_SLAVE(m)) + s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); +} + +static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + size_t start = seq->count; + + ret = show_path(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* + * Unescape the result. It would be better if supplied string was not + * escaped in the first place, but that's a pretty invasive change. + */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + return 0; +} + +static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + int err; + + err = seq_path_root(seq, &mnt_path, &s->root, ""); + return err == SEQ_SKIP ? 0 : err; +} + +static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + seq_puts(seq, sb->s_type->name); + return 0; +} + +static int statmount_string(struct kstatmount *s, u64 flag) +{ + int ret; + size_t kbufsize; + struct seq_file *seq = &s->seq; + struct statmount *sm = &s->sm; + + switch (flag) { + case STATMOUNT_FS_TYPE: + sm->fs_type = seq->count; + ret = statmount_fs_type(s, seq); + break; + case STATMOUNT_MNT_ROOT: + sm->mnt_root = seq->count; + ret = statmount_mnt_root(s, seq); + break; + case STATMOUNT_MNT_POINT: + sm->mnt_point = seq->count; + ret = statmount_mnt_point(s, seq); + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; + } + + if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) + return -EOVERFLOW; + if (kbufsize >= s->bufsize) + return -EOVERFLOW; + + /* signal a retry */ + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (ret) + return ret; + + seq->buf[seq->count++] = '\0'; + sm->mask |= flag; + return 0; +} + +static int copy_statmount_to_user(struct kstatmount *s) +{ + struct statmount *sm = &s->sm; + struct seq_file *seq = &s->seq; + char __user *str = ((char __user *)s->buf) + sizeof(*sm); + size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); + + if (seq->count && copy_to_user(str, seq->buf, seq->count)) + return -EFAULT; + + /* Return the number of bytes copied to the buffer */ + sm->size = copysize + seq->count; + if (copy_to_user(s->buf, sm, copysize)) + return -EFAULT; + + return 0; +} + +static int do_statmount(struct kstatmount *s) +{ + struct mount *m = real_mount(s->mnt); + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. + */ + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(s->mnt->mnt_root); + if (err) + return err; + + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + + if (s->mask & STATMOUNT_MNT_BASIC) + statmount_mnt_basic(s); + + if (s->mask & STATMOUNT_PROPAGATE_FROM) + statmount_propagate_from(s); + + if (s->mask & STATMOUNT_FS_TYPE) + err = statmount_string(s, STATMOUNT_FS_TYPE); + + if (!err && s->mask & STATMOUNT_MNT_ROOT) + err = statmount_string(s, STATMOUNT_MNT_ROOT); + + if (!err && s->mask & STATMOUNT_MNT_POINT) + err = statmount_string(s, STATMOUNT_MNT_POINT); + + if (err) + return err; + + return 0; +} + +static inline bool retry_statmount(const long ret, size_t *seq_size) +{ + if (likely(ret != -EAGAIN)) + return false; + if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) + return false; + if (unlikely(*seq_size > MAX_RW_COUNT)) + return false; + return true; +} + +static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, + struct statmount __user *buf, size_t bufsize, + size_t seq_size) +{ + if (!access_ok(buf, bufsize)) + return -EFAULT; + + memset(ks, 0, sizeof(*ks)); + ks->mask = kreq->param; + ks->buf = buf; + ks->bufsize = bufsize; + ks->seq.size = seq_size; + ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); + if (!ks->seq.buf) + return -ENOMEM; + return 0; +} + +static int copy_mnt_id_req(const struct mnt_id_req __user *req, + struct mnt_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + return 0; +} + +SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, + struct statmount __user *, buf, size_t, bufsize, + unsigned int, flags) +{ + struct vfsmount *mnt; + struct mnt_id_req kreq; + struct kstatmount ks; + /* We currently support retrieval of 3 strings. */ + size_t seq_size = 3 * PATH_MAX; + int ret; + + if (flags) + return -EINVAL; + + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; + +retry: + ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); + if (ret) + return ret; + + down_read(&namespace_sem); + mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + if (!mnt) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return -ENOENT; + } + + ks.mnt = mnt; + get_fs_root(current->fs, &ks.root); + ret = do_statmount(&ks); + path_put(&ks.root); + up_read(&namespace_sem); + + if (!ret) + ret = copy_statmount_to_user(&ks); + kvfree(ks.seq.buf); + if (retry_statmount(ret, &seq_size)) + goto retry; + return ret; +} + +static struct mount *listmnt_next(struct mount *curr) +{ + return node_to_mount(rb_next(&curr->mnt_node)); +} + +static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id, + u64 __user *buf, size_t bufsize, + const struct path *root) +{ + struct mount *r; + ssize_t ctr; + int err; + + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. + */ + if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) && + !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + err = security_sb_statfs(orig->dentry); + if (err) + return err; + + for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) { + if (r->mnt_id_unique == mnt_id) + continue; + if (!is_path_reachable(r, r->mnt.mnt_root, orig)) + continue; + ctr = array_index_nospec(ctr, bufsize); + if (put_user(r->mnt_id_unique, buf + ctr)) + return -EFAULT; + if (check_add_overflow(ctr, 1, &ctr)) + return -ERANGE; + } + return ctr; +} + +SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, + u64 __user *, buf, size_t, bufsize, unsigned int, flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mnt_id_req kreq; + struct mount *first; + struct path root, orig; + u64 mnt_id, last_mnt_id; + ssize_t ret; + + if (flags) + return -EINVAL; + + ret = copy_mnt_id_req(req, &kreq); + if (ret) + return ret; + mnt_id = kreq.mnt_id; + last_mnt_id = kreq.param; + + down_read(&namespace_sem); + get_fs_root(current->fs, &root); + if (mnt_id == LSMT_ROOT) { + orig = root; + } else { + ret = -ENOENT; + orig.mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!orig.mnt) + goto err; + orig.dentry = orig.mnt->mnt_root; + } + if (!last_mnt_id) + first = node_to_mount(rb_first(&ns->mounts)); + else + first = mnt_find_id_at(ns, last_mnt_id + 1); + + ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root); +err: + path_put(&root); + up_read(&namespace_sem); + return ret; +} + + static void __init init_mount_tree(void) { struct vfsmount *mnt; @@ -4692,10 +5123,9 @@ static void __init init_mount_tree(void) if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); - m->mnt_ns = ns; ns->root = m; - ns->mounts = 1; - list_add(&m->mnt_list, &ns->list); + ns->nr_mounts = 1; + mnt_add_to_ns(ns, m); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -4822,18 +5252,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, int *new_mnt_flags) { int new_flags = *new_mnt_flags; - struct mount *mnt; + struct mount *mnt, *n; bool visible = false; down_read(&namespace_sem); - lock_ns_list(ns); - list_for_each_entry(mnt, &ns->list, mnt_list) { + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; - if (mnt_is_cursor(mnt)) - continue; - if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -4881,7 +5307,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: - unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git a/fs/pnode.c b/fs/pnode.c index e4d0340393d5..a799e0315cc9 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount) mnt->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_umounting); - list_move_tail(&mnt->mnt_list, to_umount); + move_from_ns(mnt, to_umount); } /* diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 250eb5bf7b52..0a808951b7d3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -142,13 +142,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) { - err = sb->s_op->show_path(m, mnt->mnt_root); - if (err) - goto out; - } else { - seq_dentry(m, mnt->mnt_root, " \t\n\\"); - } + err = show_path(m, mnt->mnt_root); + if (err) + goto out; seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -283,8 +279,6 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - INIT_LIST_HEAD(&p->cursor.mnt_list); - p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -301,7 +295,6 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); - mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/stat.c b/fs/stat.c index 8a8f7e97a742..77cdc69eb422 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -247,8 +247,13 @@ retry: error = vfs_getattr(&path, stat, request_mask, flags); - stat->mnt_id = real_mount(path.mnt)->mnt_id; - stat->result_mask |= STATX_MNT_ID; + if (request_mask & STATX_MNT_ID_UNIQUE) { + stat->mnt_id = real_mount(path.mnt)->mnt_id_unique; + stat->result_mask |= STATX_MNT_ID_UNIQUE; + } else { + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + } if (path.mnt->mnt_root == path.dentry) stat->attributes |= STATX_ATTR_MOUNT_ROOT; diff --git a/include/linux/mount.h b/include/linux/mount.h index ac3dd2876197..c34c18b4e8f3 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,8 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ - MNT_CURSOR) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) #define MNT_INTERNAL 0x4000 @@ -65,7 +64,7 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_CURSOR 0x10000000 +#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e92..2d6d3e76e3f7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -74,6 +74,8 @@ struct landlock_ruleset_attr; enum landlock_rule_type; struct cachestat_range; struct cachestat; +struct statmount; +struct mnt_id_req; #include <linux/types.h> #include <linux/aio_abi.h> @@ -407,6 +409,12 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf); asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf); +asmlinkage long sys_statmount(const struct mnt_id_req __user *req, + struct statmount __user *buf, size_t bufsize, + unsigned int flags); +asmlinkage long sys_listmount(const struct mnt_id_req __user *req, + u64 __user *buf, size_t bufsize, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); #if BITS_PER_LONG == 32 diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 756b013fb832..b67b18e71fbd 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -829,8 +829,14 @@ __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_statmount 457 +__SYSCALL(__NR_statmount, sys_statmount) + +#define __NR_listmount 458 +__SYSCALL(__NR_listmount, sys_listmount) + #undef __NR_syscalls -#define __NR_syscalls 457 +#define __NR_syscalls 459 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index bb242fdcfe6b..ad5478dbad00 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -138,4 +138,74 @@ struct mount_attr { /* List of all mount_attr versions. */ #define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */ + +/* + * Structure for getting mount/superblock/filesystem info with statmount(2). + * + * The interface is similar to statx(2): individual fields or groups can be + * selected with the @mask argument of statmount(). Kernel will set the @mask + * field according to the supported fields. + * + * If string fields are selected, then the caller needs to pass a buffer that + * has space after the fixed part of the structure. Nul terminated strings are + * copied there and offsets relative to @str are stored in the relevant fields. + * If the buffer is too small, then EOVERFLOW is returned. The actually used + * size is returned in @size. + */ +struct statmount { + __u32 size; /* Total size, including strings */ + __u32 __spare1; + __u64 mask; /* What results were written */ + __u32 sb_dev_major; /* Device ID */ + __u32 sb_dev_minor; + __u64 sb_magic; /* ..._SUPER_MAGIC */ + __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */ + __u32 fs_type; /* [str] Filesystem type */ + __u64 mnt_id; /* Unique ID of mount */ + __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */ + __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */ + __u32 mnt_parent_id_old; + __u64 mnt_attr; /* MOUNT_ATTR_... */ + __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */ + __u64 mnt_peer_group; /* ID of shared peer group */ + __u64 mnt_master; /* Mount receives propagation from this ID */ + __u64 propagate_from; /* Propagation from in current namespace */ + __u32 mnt_root; /* [str] Root of mount relative to root of fs */ + __u32 mnt_point; /* [str] Mountpoint relative to current root */ + __u64 __spare2[50]; + char str[]; /* Variable size part containing strings */ +}; + +/* + * Structure for passing mount ID and miscellaneous parameters to statmount(2) + * and listmount(2). + * + * For statmount(2) @param represents the request mask. + * For listmount(2) @param represents the last listed mount id (or zero). + */ +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; +}; + +/* List of all mnt_id_req versions. */ +#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ + +/* + * @mask bits for statmount(2) + */ +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ + +/* + * Special @mnt_id values that can be passed to listmount + */ +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7cab2c65d3d7..2f2ee82d5517 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -154,6 +154,7 @@ struct statx { #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 27f9f679ed15..8dc3de8beb98 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -27,6 +27,7 @@ TARGETS += filesystems/binderfs TARGETS += filesystems/epoll TARGETS += filesystems/fat TARGETS += filesystems/overlayfs +TARGETS += filesystems/statmount TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/statmount/.gitignore b/tools/testing/selftests/filesystems/statmount/.gitignore new file mode 100644 index 000000000000..82a4846cbc4b --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/*_test diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile new file mode 100644 index 000000000000..07a0d5b545ca --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) +TEST_GEN_PROGS := statmount_test + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c new file mode 100644 index 000000000000..3eafd7da58e2 --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE + +#include <assert.h> +#include <stdint.h> +#include <sched.h> +#include <fcntl.h> +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/statfs.h> +#include <linux/mount.h> +#include <linux/stat.h> +#include <asm/unistd.h> + +#include "../../kselftest.h" + +static const char *const known_fs[] = { + "9p", "adfs", "affs", "afs", "aio", "anon_inodefs", "apparmorfs", + "autofs", "bcachefs", "bdev", "befs", "bfs", "binder", "binfmt_misc", + "bpf", "btrfs", "btrfs_test_fs", "ceph", "cgroup", "cgroup2", "cifs", + "coda", "configfs", "cpuset", "cramfs", "cxl", "dax", "debugfs", + "devpts", "devtmpfs", "dmabuf", "drm", "ecryptfs", "efivarfs", "efs", + "erofs", "exfat", "ext2", "ext3", "ext4", "f2fs", "functionfs", + "fuse", "fuseblk", "fusectl", "gadgetfs", "gfs2", "gfs2meta", "hfs", + "hfsplus", "hostfs", "hpfs", "hugetlbfs", "ibmasmfs", "iomem", + "ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos", + "nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2", + "ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs", + "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "reiserfs", + "resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem", + "securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs", + "squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf", + "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", + "zonefs", NULL }; + +static int statmount(uint64_t mnt_id, uint64_t mask, struct statmount *buf, + size_t bufsize, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = mask, + }; + + return syscall(__NR_statmount, &req, buf, bufsize, flags); +} + +static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) +{ + size_t bufsize = 1 << 15; + struct statmount *buf = NULL, *tmp = alloca(bufsize); + int tofree = 0; + int ret; + + for (;;) { + ret = statmount(mnt_id, mask, tmp, bufsize, flags); + if (ret != -1) + break; + if (tofree) + free(tmp); + if (errno != EOVERFLOW) + return NULL; + bufsize <<= 1; + tofree = 1; + tmp = malloc(bufsize); + if (!tmp) + return NULL; + } + buf = malloc(tmp->size); + if (buf) + memcpy(buf, tmp, tmp->size); + if (tofree) + free(tmp); + + return buf; +} + +static void write_file(const char *path, const char *val) +{ + int fd = open(path, O_WRONLY); + size_t len = strlen(val); + int ret; + + if (fd == -1) + ksft_exit_fail_msg("opening %s for write: %s\n", path, strerror(errno)); + + ret = write(fd, val, len); + if (ret == -1) + ksft_exit_fail_msg("writing to %s: %s\n", path, strerror(errno)); + if (ret != len) + ksft_exit_fail_msg("short write to %s\n", path); + + ret = close(fd); + if (ret == -1) + ksft_exit_fail_msg("closing %s\n", path); +} + +static uint64_t get_mnt_id(const char *name, const char *path, uint64_t mask) +{ + struct statx sx; + int ret; + + ret = statx(AT_FDCWD, path, 0, mask, &sx); + if (ret == -1) + ksft_exit_fail_msg("retrieving %s mount ID for %s: %s\n", + mask & STATX_MNT_ID_UNIQUE ? "unique" : "old", + name, strerror(errno)); + if (!(sx.stx_mask & mask)) + ksft_exit_fail_msg("no %s mount ID available for %s\n", + mask & STATX_MNT_ID_UNIQUE ? "unique" : "old", + name); + + return sx.stx_mnt_id; +} + + +static char root_mntpoint[] = "/tmp/statmount_test_root.XXXXXX"; +static int orig_root; +static uint64_t root_id, parent_id; +static uint32_t old_root_id, old_parent_id; + + +static void cleanup_namespace(void) +{ + fchdir(orig_root); + chroot("."); + umount2(root_mntpoint, MNT_DETACH); + rmdir(root_mntpoint); +} + +static void setup_namespace(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); + if (ret == -1) + ksft_exit_fail_msg("unsharing mountns and userns: %s\n", + strerror(errno)); + + sprintf(buf, "0 %d 1", uid); + write_file("/proc/self/uid_map", buf); + write_file("/proc/self/setgroups", "deny"); + sprintf(buf, "0 %d 1", gid); + write_file("/proc/self/gid_map", buf); + + ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); + if (ret == -1) + ksft_exit_fail_msg("making mount tree private: %s\n", + strerror(errno)); + + if (!mkdtemp(root_mntpoint)) + ksft_exit_fail_msg("creating temporary directory %s: %s\n", + root_mntpoint, strerror(errno)); + + old_parent_id = get_mnt_id("parent", root_mntpoint, STATX_MNT_ID); + parent_id = get_mnt_id("parent", root_mntpoint, STATX_MNT_ID_UNIQUE); + + orig_root = open("/", O_PATH); + if (orig_root == -1) + ksft_exit_fail_msg("opening root directory: %s", + strerror(errno)); + + atexit(cleanup_namespace); + + ret = mount(root_mntpoint, root_mntpoint, NULL, MS_BIND, NULL); + if (ret == -1) + ksft_exit_fail_msg("mounting temp root %s: %s\n", + root_mntpoint, strerror(errno)); + + ret = chroot(root_mntpoint); + if (ret == -1) + ksft_exit_fail_msg("chroot to temp root %s: %s\n", + root_mntpoint, strerror(errno)); + + ret = chdir("/"); + if (ret == -1) + ksft_exit_fail_msg("chdir to root: %s\n", strerror(errno)); + + old_root_id = get_mnt_id("root", "/", STATX_MNT_ID); + root_id = get_mnt_id("root", "/", STATX_MNT_ID_UNIQUE); +} + +static int setup_mount_tree(int log2_num) +{ + int ret, i; + + ret = mount("", "/", NULL, MS_REC|MS_SHARED, NULL); + if (ret == -1) { + ksft_test_result_fail("making mount tree shared: %s\n", + strerror(errno)); + return -1; + } + + for (i = 0; i < log2_num; i++) { + ret = mount("/", "/", NULL, MS_BIND, NULL); + if (ret == -1) { + ksft_test_result_fail("mounting submount %s: %s\n", + root_mntpoint, strerror(errno)); + return -1; + } + } + return 0; +} + +static ssize_t listmount(uint64_t mnt_id, uint64_t last_mnt_id, + uint64_t list[], size_t num, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = last_mnt_id, + }; + + return syscall(__NR_listmount, &req, list, num, flags); +} + +static void test_listmount_empty_root(void) +{ + ssize_t res; + const unsigned int size = 32; + uint64_t list[size]; + + res = listmount(LSMT_ROOT, 0, list, size, 0); + if (res == -1) { + ksft_test_result_fail("listmount: %s\n", strerror(errno)); + return; + } + if (res != 1) { + ksft_test_result_fail("listmount result is %zi != 1\n", res); + return; + } + + if (list[0] != root_id) { + ksft_test_result_fail("listmount ID doesn't match 0x%llx != 0x%llx\n", + (unsigned long long) list[0], + (unsigned long long) root_id); + return; + } + + ksft_test_result_pass("listmount empty root\n"); +} + +static void test_statmount_zero_mask(void) +{ + struct statmount sm; + int ret; + + ret = statmount(root_id, 0, &sm, sizeof(sm), 0); + if (ret == -1) { + ksft_test_result_fail("statmount zero mask: %s\n", + strerror(errno)); + return; + } + if (sm.size != sizeof(sm)) { + ksft_test_result_fail("unexpected size: %u != %u\n", + sm.size, (uint32_t) sizeof(sm)); + return; + } + if (sm.mask != 0) { + ksft_test_result_fail("unexpected mask: 0x%llx != 0x0\n", + (unsigned long long) sm.mask); + return; + } + + ksft_test_result_pass("statmount zero mask\n"); +} + +static void test_statmount_mnt_basic(void) +{ + struct statmount sm; + int ret; + uint64_t mask = STATMOUNT_MNT_BASIC; + + ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + if (ret == -1) { + ksft_test_result_fail("statmount mnt basic: %s\n", + strerror(errno)); + return; + } + if (sm.size != sizeof(sm)) { + ksft_test_result_fail("unexpected size: %u != %u\n", + sm.size, (uint32_t) sizeof(sm)); + return; + } + if (sm.mask != mask) { + ksft_test_result_skip("statmount mnt basic unavailable\n"); + return; + } + + if (sm.mnt_id != root_id) { + ksft_test_result_fail("unexpected root ID: 0x%llx != 0x%llx\n", + (unsigned long long) sm.mnt_id, + (unsigned long long) root_id); + return; + } + + if (sm.mnt_id_old != old_root_id) { + ksft_test_result_fail("unexpected old root ID: %u != %u\n", + sm.mnt_id_old, old_root_id); + return; + } + + if (sm.mnt_parent_id != parent_id) { + ksft_test_result_fail("unexpected parent ID: 0x%llx != 0x%llx\n", + (unsigned long long) sm.mnt_parent_id, + (unsigned long long) parent_id); + return; + } + + if (sm.mnt_parent_id_old != old_parent_id) { + ksft_test_result_fail("unexpected old parent ID: %u != %u\n", + sm.mnt_parent_id_old, old_parent_id); + return; + } + + if (sm.mnt_propagation != MS_PRIVATE) { + ksft_test_result_fail("unexpected propagation: 0x%llx\n", + (unsigned long long) sm.mnt_propagation); + return; + } + + ksft_test_result_pass("statmount mnt basic\n"); +} + + +static void test_statmount_sb_basic(void) +{ + struct statmount sm; + int ret; + uint64_t mask = STATMOUNT_SB_BASIC; + struct statx sx; + struct statfs sf; + + ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + if (ret == -1) { + ksft_test_result_fail("statmount sb basic: %s\n", + strerror(errno)); + return; + } + if (sm.size != sizeof(sm)) { + ksft_test_result_fail("unexpected size: %u != %u\n", + sm.size, (uint32_t) sizeof(sm)); + return; + } + if (sm.mask != mask) { + ksft_test_result_skip("statmount sb basic unavailable\n"); + return; + } + + ret = statx(AT_FDCWD, "/", 0, 0, &sx); + if (ret == -1) { + ksft_test_result_fail("stat root failed: %s\n", + strerror(errno)); + return; + } + + if (sm.sb_dev_major != sx.stx_dev_major || + sm.sb_dev_minor != sx.stx_dev_minor) { + ksft_test_result_fail("unexpected sb dev %u:%u != %u:%u\n", + sm.sb_dev_major, sm.sb_dev_minor, + sx.stx_dev_major, sx.stx_dev_minor); + return; + } + + ret = statfs("/", &sf); + if (ret == -1) { + ksft_test_result_fail("statfs root failed: %s\n", + strerror(errno)); + return; + } + + if (sm.sb_magic != sf.f_type) { + ksft_test_result_fail("unexpected sb magic: 0x%llx != 0x%lx\n", + (unsigned long long) sm.sb_magic, + sf.f_type); + return; + } + + ksft_test_result_pass("statmount sb basic\n"); +} + +static void test_statmount_mnt_point(void) +{ + struct statmount *sm; + + sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0); + if (!sm) { + ksft_test_result_fail("statmount mount point: %s\n", + strerror(errno)); + return; + } + + if (strcmp(sm->str + sm->mnt_point, "/") != 0) { + ksft_test_result_fail("unexpected mount point: '%s' != '/'\n", + sm->str + sm->mnt_point); + goto out; + } + ksft_test_result_pass("statmount mount point\n"); +out: + free(sm); +} + +static void test_statmount_mnt_root(void) +{ + struct statmount *sm; + const char *mnt_root, *last_dir, *last_root; + + last_dir = strrchr(root_mntpoint, '/'); + assert(last_dir); + last_dir++; + + sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0); + if (!sm) { + ksft_test_result_fail("statmount mount root: %s\n", + strerror(errno)); + return; + } + mnt_root = sm->str + sm->mnt_root; + last_root = strrchr(mnt_root, '/'); + if (last_root) + last_root++; + else + last_root = mnt_root; + + if (strcmp(last_dir, last_root) != 0) { + ksft_test_result_fail("unexpected mount root last component: '%s' != '%s'\n", + last_root, last_dir); + goto out; + } + ksft_test_result_pass("statmount mount root\n"); +out: + free(sm); +} + +static void test_statmount_fs_type(void) +{ + struct statmount *sm; + const char *fs_type; + const char *const *s; + + sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0); + if (!sm) { + ksft_test_result_fail("statmount fs type: %s\n", + strerror(errno)); + return; + } + fs_type = sm->str + sm->fs_type; + for (s = known_fs; s != NULL; s++) { + if (strcmp(fs_type, *s) == 0) + break; + } + if (!s) + ksft_print_msg("unknown filesystem type: %s\n", fs_type); + + ksft_test_result_pass("statmount fs type\n"); + free(sm); +} + +static void test_statmount_string(uint64_t mask, size_t off, const char *name) +{ + struct statmount *sm; + size_t len, shortsize, exactsize; + uint32_t start, i; + int ret; + + sm = statmount_alloc(root_id, mask, 0); + if (!sm) { + ksft_test_result_fail("statmount %s: %s\n", name, + strerror(errno)); + goto out; + } + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto out; + } + if (sm->mask != mask) { + ksft_test_result_skip("statmount %s unavailable\n", name); + goto out; + } + len = sm->size - sizeof(*sm); + start = ((uint32_t *) sm)[off]; + + for (i = start;; i++) { + if (i >= len) { + ksft_test_result_fail("string out of bounds\n"); + goto out; + } + if (!sm->str[i]) + break; + } + exactsize = sm->size; + shortsize = sizeof(*sm) + i; + + ret = statmount(root_id, mask, sm, exactsize, 0); + if (ret == -1) { + ksft_test_result_fail("statmount exact size: %s\n", + strerror(errno)); + goto out; + } + errno = 0; + ret = statmount(root_id, mask, sm, shortsize, 0); + if (ret != -1 || errno != EOVERFLOW) { + ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", + strerror(errno)); + goto out; + } + + ksft_test_result_pass("statmount string %s\n", name); +out: + free(sm); +} + +static void test_listmount_tree(void) +{ + ssize_t res; + const unsigned int log2_num = 4; + const unsigned int step = 3; + const unsigned int size = (1 << log2_num) + step + 1; + size_t num, expect = 1 << log2_num; + uint64_t list[size]; + uint64_t list2[size]; + size_t i; + + + res = setup_mount_tree(log2_num); + if (res == -1) + return; + + num = res = listmount(LSMT_ROOT, 0, list, size, 0); + if (res == -1) { + ksft_test_result_fail("listmount: %s\n", strerror(errno)); + return; + } + if (num != expect) { + ksft_test_result_fail("listmount result is %zi != %zi\n", + res, expect); + return; + } + + for (i = 0; i < size - step;) { + res = listmount(LSMT_ROOT, i ? list2[i - 1] : 0, list2 + i, step, 0); + if (res == -1) + ksft_test_result_fail("short listmount: %s\n", + strerror(errno)); + i += res; + if (res < step) + break; + } + if (i != num) { + ksft_test_result_fail("different number of entries: %zu != %zu\n", + i, num); + return; + } + for (i = 0; i < num; i++) { + if (list2[i] != list[i]) { + ksft_test_result_fail("different value for entry %zu: 0x%llx != 0x%llx\n", + i, + (unsigned long long) list2[i], + (unsigned long long) list[i]); + } + } + + ksft_test_result_pass("listmount tree\n"); +} + +#define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) + +int main(void) +{ + int ret; + uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | + STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE; + + ksft_print_header(); + + ret = statmount(0, 0, NULL, 0, 0); + assert(ret == -1); + if (errno == ENOSYS) + ksft_exit_skip("statmount() syscall not supported\n"); + + setup_namespace(); + + ksft_set_plan(14); + test_listmount_empty_root(); + test_statmount_zero_mask(); + test_statmount_mnt_basic(); + test_statmount_sb_basic(); + test_statmount_mnt_root(); + test_statmount_mnt_point(); + test_statmount_fs_type(); + test_statmount_string(STATMOUNT_MNT_ROOT, str_off(mnt_root), "mount root"); + test_statmount_string(STATMOUNT_MNT_POINT, str_off(mnt_point), "mount point"); + test_statmount_string(STATMOUNT_FS_TYPE, str_off(fs_type), "fs type"); + test_statmount_string(all_mask, str_off(mnt_root), "mount root & all"); + test_statmount_string(all_mask, str_off(mnt_point), "mount point & all"); + test_statmount_string(all_mask, str_off(fs_type), "fs type & all"); + + test_listmount_tree(); + + + if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} |