Diffstat (limited to 'fs/namespace.c')
-rw-r--r--  fs/namespace.c  537
1 file changed, 464 insertions, 73 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 9d33909d0f9e..56bb5a5fdc0d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/memblock.h>
+#include <linux/proc_fs.h>
 #include <linux/task_work.h>
 #include <linux/sched/task.h>
 #include <uapi/linux/mount.h>
@@ -73,6 +74,15 @@ static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
 
+struct mount_kattr {
+	unsigned int attr_set;
+	unsigned int attr_clr;
+	unsigned int propagation;
+	unsigned int lookup_flags;
+	bool recurse;
+	struct user_namespace *mnt_userns;
+};
+
 /* /sys/fs */
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
@@ -87,6 +97,16 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
+static inline void lock_mount_hash(void)
+{
+	write_seqlock(&mount_lock);
+}
+
+static inline void unlock_mount_hash(void)
+{
+	write_sequnlock(&mount_lock);
+}
+
 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -210,6 +230,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		mnt->mnt.mnt_userns = &init_user_ns;
 	}
 	return mnt;
@@ -360,50 +381,36 @@ int mnt_want_write(struct vfsmount *m)
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
- * mnt_clone_write - get write access to a mount
- * @mnt: the mount on which to take a write
- *
- * This is effectively like mnt_want_write, except
- * it must only be used to take an extra write reference
- * on a mountpoint that we already know has a write reference
- * on it. This allows some optimisation.
- *
- * After finished, mnt_drop_write must be called as usual to
- * drop the reference.
- */
-int mnt_clone_write(struct vfsmount *mnt)
-{
-	/* superblock may be r/o */
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	preempt_disable();
-	mnt_inc_writers(real_mount(mnt));
-	preempt_enable();
-	return 0;
-}
-EXPORT_SYMBOL_GPL(mnt_clone_write);
-
-/**
  * __mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like __mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the check for emergency r/o remounts.  This must be
+ * paired with __mnt_drop_write_file.
  */
 int __mnt_want_write_file(struct file *file)
 {
-	if (!(file->f_mode & FMODE_WRITER))
-		return __mnt_want_write(file->f_path.mnt);
-	else
-		return mnt_clone_write(file->f_path.mnt);
+	if (file->f_mode & FMODE_WRITER) {
+		/*
+		 * Superblock may have become readonly while there are still
+		 * writable fd's, e.g. due to a fs error with errors=remount-ro
+		 */
+		if (__mnt_is_readonly(file->f_path.mnt))
+			return -EROFS;
+		return 0;
+	}
+	return __mnt_want_write(file->f_path.mnt);
 }
 
 /**
  * mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like mnt_want_write, but it takes a file and can
- * do some optimisations if the file is open for write already
+ * This is like mnt_want_write, but if the file is already open for writing it
+ * skips incrementing mnt_writers (since the open file already has a reference)
+ * and instead only does the freeze protection and the check for emergency r/o
+ * remounts.  This must be paired with mnt_drop_write_file.
  */
 int mnt_want_write_file(struct file *file)
 {
@@ -449,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write);
 
 void __mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write(file->f_path.mnt);
+	if (!(file->f_mode & FMODE_WRITER))
+		__mnt_drop_write(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
@@ -459,11 +467,8 @@ void mnt_drop_write_file(struct file *file)
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
 
-static int mnt_make_readonly(struct mount *mnt)
+static inline int mnt_hold_writers(struct mount *mnt)
 {
-	int ret = 0;
-
-	lock_mount_hash();
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -488,25 +493,30 @@ static int mnt_make_readonly(struct mount *mnt)
 	 * we're counting up here.
 	 */
 	if (mnt_get_writers(mnt) > 0)
-		ret = -EBUSY;
-	else
-		mnt->mnt.mnt_flags |= MNT_READONLY;
+		return -EBUSY;
+
+	return 0;
+}
+
+static inline void mnt_unhold_writers(struct mount *mnt)
+{
 	/*
 	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
 	 * that become unheld will see MNT_READONLY.
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	unlock_mount_hash();
-	return ret;
 }
 
-static int __mnt_unmake_readonly(struct mount *mnt)
+static int mnt_make_readonly(struct mount *mnt)
 {
-	lock_mount_hash();
-	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	unlock_mount_hash();
-	return 0;
+	int ret;
+
+	ret = mnt_hold_writers(mnt);
+	if (!ret)
+		mnt->mnt.mnt_flags |= MNT_READONLY;
+	mnt_unhold_writers(mnt);
+	return ret;
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -547,6 +557,11 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+	struct user_namespace *mnt_userns;
+
+	mnt_userns = mnt_user_ns(&mnt->mnt);
+	if (mnt_userns != &init_user_ns)
+		put_user_ns(mnt_userns);
 	kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
@@ -1055,6 +1070,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
 
 	atomic_inc(&sb->s_active);
+	mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
+	if (mnt->mnt.mnt_userns != &init_user_ns)
+		mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
@@ -2514,20 +2532,15 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
 	if (readonly_request)
 		return mnt_make_readonly(mnt);
 
-	return __mnt_unmake_readonly(mnt);
+	mnt->mnt.mnt_flags &= ~MNT_READONLY;
+	return 0;
 }
 
-/*
- * Update the user-settable attributes on a mount.  The caller must hold
- * sb->s_umount for writing.
- */
 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 {
-	lock_mount_hash();
 	mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
 	mnt->mnt.mnt_flags = mnt_flags;
 	touch_mnt_namespace(mnt->mnt_ns);
-	unlock_mount_hash();
 }
 
 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
@@ -2572,11 +2585,17 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
 	if (!can_change_locked_flags(mnt, mnt_flags))
 		return -EPERM;
 
-	down_write(&sb->s_umount);
+	/*
+	 * We're only checking whether the superblock is read-only not
+	 * changing it, so only take down_read(&sb->s_umount).
+	 */
+	down_read(&sb->s_umount);
+	lock_mount_hash();
 	ret = change_mount_ro_state(mnt, mnt_flags);
 	if (ret == 0)
 		set_mount_attributes(mnt, mnt_flags);
-	up_write(&sb->s_umount);
+	unlock_mount_hash();
+	up_read(&sb->s_umount);
 
 	mnt_warn_timestamp_expiry(path, &mnt->mnt);
 
@@ -2616,8 +2635,11 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
 		err = -EPERM;
 		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
 			err = reconfigure_super(fc);
-			if (!err)
+			if (!err) {
+				lock_mount_hash();
 				set_mount_attributes(mnt, mnt_flags);
+				unlock_mount_hash();
+			}
 		}
 		up_write(&sb->s_umount);
 	}
@@ -3440,6 +3462,33 @@ out_type:
 	return ret;
 }
 
+#define FSMOUNT_VALID_FLAGS \
+	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
+	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
+
+#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
+
+#define MOUNT_SETATTR_PROPAGATION_FLAGS \
+	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
+
+static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
+{
+	unsigned int mnt_flags = 0;
+
+	if (attr_flags & MOUNT_ATTR_RDONLY)
+		mnt_flags |= MNT_READONLY;
+	if (attr_flags & MOUNT_ATTR_NOSUID)
+		mnt_flags |= MNT_NOSUID;
+	if (attr_flags & MOUNT_ATTR_NODEV)
+		mnt_flags |= MNT_NODEV;
+	if (attr_flags & MOUNT_ATTR_NOEXEC)
+		mnt_flags |= MNT_NOEXEC;
+	if (attr_flags & MOUNT_ATTR_NODIRATIME)
+		mnt_flags |= MNT_NODIRATIME;
+
+	return mnt_flags;
+}
+
 /*
  * Create a kernel mount representation for a new, prepared superblock
  * (specified by fs_fd) and attach to an open_tree-like file descriptor.
@@ -3462,24 +3511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
 		return -EINVAL;
 
-	if (attr_flags & ~(MOUNT_ATTR_RDONLY |
-			   MOUNT_ATTR_NOSUID |
-			   MOUNT_ATTR_NODEV |
-			   MOUNT_ATTR_NOEXEC |
-			   MOUNT_ATTR__ATIME |
-			   MOUNT_ATTR_NODIRATIME))
+	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
 		return -EINVAL;
 
-	if (attr_flags & MOUNT_ATTR_RDONLY)
-		mnt_flags |= MNT_READONLY;
-	if (attr_flags & MOUNT_ATTR_NOSUID)
-		mnt_flags |= MNT_NOSUID;
-	if (attr_flags & MOUNT_ATTR_NODEV)
-		mnt_flags |= MNT_NODEV;
-	if (attr_flags & MOUNT_ATTR_NOEXEC)
-		mnt_flags |= MNT_NOEXEC;
-	if (attr_flags & MOUNT_ATTR_NODIRATIME)
-		mnt_flags |= MNT_NODIRATIME;
+	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
 
 	switch (attr_flags & MOUNT_ATTR__ATIME) {
 	case MOUNT_ATTR_STRICTATIME:
@@ -3787,6 +3822,362 @@ out0:
 	return error;
 }
 
+static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
+{
+	unsigned int flags = mnt->mnt.mnt_flags;
+
+	/* flags to clear */
+	flags &= ~kattr->attr_clr;
+	/* flags to raise */
+	flags |= kattr->attr_set;
+
+	return flags;
+}
+
+static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct vfsmount *m = &mnt->mnt;
+
+	if (!kattr->mnt_userns)
+		return 0;
+
+	/*
+	 * Once a mount has been idmapped we don't allow it to change its
+	 * mapping. It makes things simpler and callers can just create
+	 * another bind-mount they can idmap if they want to.
+	 */
+	if (mnt_user_ns(m) != &init_user_ns)
+		return -EPERM;
+
+	/* The underlying filesystem doesn't support idmapped mounts yet. */
+	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
+		return -EINVAL;
+
+	/* We're not controlling the superblock. */
+	if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Mount has already been visible in the filesystem hierarchy. */
+	if (!is_anon_ns(mnt->mnt_ns))
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
+					   struct mount *mnt, int *err)
+{
+	struct mount *m = mnt, *last = NULL;
+
+	if (!is_mounted(&m->mnt)) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
+		*err = -EINVAL;
+		goto out;
+	}
+
+	do {
+		unsigned int flags;
+
+		flags = recalc_flags(kattr, m);
+		if (!can_change_locked_flags(m, flags)) {
+			*err = -EPERM;
+			goto out;
+		}
+
+		*err = can_idmap_mount(kattr, m);
+		if (*err)
+			goto out;
+
+		last = m;
+
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    !(m->mnt.mnt_flags & MNT_READONLY)) {
+			*err = mnt_hold_writers(m);
+			if (*err)
+				goto out;
+		}
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+out:
+	return last;
+}
+
+static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+	struct user_namespace *mnt_userns;
+
+	if (!kattr->mnt_userns)
+		return;
+
+	mnt_userns = get_user_ns(kattr->mnt_userns);
+	/* Pairs with smp_load_acquire() in mnt_user_ns(). */
+	smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+}
+
+static void mount_setattr_commit(struct mount_kattr *kattr,
+				 struct mount *mnt, struct mount *last,
+				 int err)
+{
+	struct mount *m = mnt;
+
+	do {
+		if (!err) {
+			unsigned int flags;
+
+			do_idmap_mount(kattr, m);
+			flags = recalc_flags(kattr, m);
+			WRITE_ONCE(m->mnt.mnt_flags, flags);
+		}
+
+		/*
+		 * We either set MNT_READONLY above so make it visible
+		 * before ~MNT_WRITE_HOLD or we failed to recursively
+		 * apply mount options.
+		 */
+		if ((kattr->attr_set & MNT_READONLY) &&
+		    (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+			mnt_unhold_writers(m);
+
+		if (!err && kattr->propagation)
+			change_mnt_propagation(m, kattr->propagation);
+
+		/*
+		 * On failure, only cleanup until we found the first mount
+		 * we failed to handle.
+		 */
+		if (err && m == last)
+			break;
+	} while (kattr->recurse && (m = next_mnt(m, mnt)));
+
+	if (!err)
+		touch_mnt_namespace(mnt->mnt_ns);
+}
+
+static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
+{
+	struct mount *mnt = real_mount(path->mnt), *last = NULL;
+	int err = 0;
+
+	if (path->dentry != mnt->mnt.mnt_root)
+		return -EINVAL;
+
+	if (kattr->propagation) {
+		/*
+		 * Only take namespace_lock() if we're actually changing
+		 * propagation.
+		 */
+		namespace_lock();
+		if (kattr->propagation == MS_SHARED) {
+			err = invent_group_ids(mnt, kattr->recurse);
+			if (err) {
+				namespace_unlock();
+				return err;
+			}
+		}
+	}
+
+	lock_mount_hash();
+
+	/*
+	 * Get the mount tree in a shape where we can change mount
+	 * properties without failure.
+	 */
+	last = mount_setattr_prepare(kattr, mnt, &err);
+	if (last) /* Commit all changes or revert to the old state. */
+		mount_setattr_commit(kattr, mnt, last, err);
+
+	unlock_mount_hash();
+
+	if (kattr->propagation) {
+		namespace_unlock();
+		if (err)
+			cleanup_group_ids(mnt, NULL);
+	}
+
+	return err;
+}
+
+static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
+				struct mount_kattr *kattr, unsigned int flags)
+{
+	int err = 0;
+	struct ns_common *ns;
+	struct user_namespace *mnt_userns;
+	struct file *file;
+
+	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
+		return 0;
+
+	/*
+	 * We currently do not support clearing an idmapped mount. If this ever
+	 * is a use-case we can revisit this but for now let's keep it simple
+	 * and not allow it.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
+		return -EINVAL;
+
+	if (attr->userns_fd > INT_MAX)
+		return -EINVAL;
+
+	file = fget(attr->userns_fd);
+	if (!file)
+		return -EBADF;
+
+	if (!proc_ns_file(file)) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	ns = get_proc_ns(file_inode(file));
+	if (ns->ops->type != CLONE_NEWUSER) {
+		err = -EINVAL;
+		goto out_fput;
+	}
+
+	/*
+	 * The init_user_ns is used to indicate that a vfsmount is not idmapped.
+	 * This is simpler than just having to treat NULL as unmapped. Users
+	 * wanting to idmap a mount to init_user_ns can just use a namespace
+	 * with an identity mapping.
+	 */
+	mnt_userns = container_of(ns, struct user_namespace, ns);
+	if (mnt_userns == &init_user_ns) {
+		err = -EPERM;
+		goto out_fput;
+	}
+	kattr->mnt_userns = get_user_ns(mnt_userns);
+
+out_fput:
+	fput(file);
+	return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
+			     struct mount_kattr *kattr, unsigned int flags)
+{
+	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+	if (flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	*kattr = (struct mount_kattr) {
+		.lookup_flags	= lookup_flags,
+		.recurse	= !!(flags & AT_RECURSIVE),
+	};
+
+	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
+		return -EINVAL;
+	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
+		return -EINVAL;
+	kattr->propagation = attr->propagation;
+
+	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
+		return -EINVAL;
+
+	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
+	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
+
+	/*
+	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
+	 * users wanting to transition to a different atime setting cannot
+	 * simply specify the atime setting in @attr_set, but must also
+	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
+	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
+	 * @attr_clr and that @attr_set can't have any atime bits set if
+	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
+	 */
+	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
+		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
+			return -EINVAL;
+
+		/*
+		 * Clear all previous time settings as they are mutually
+		 * exclusive.
+		 */
+		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
+		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
+		case MOUNT_ATTR_RELATIME:
+			kattr->attr_set |= MNT_RELATIME;
+			break;
+		case MOUNT_ATTR_NOATIME:
+			kattr->attr_set |= MNT_NOATIME;
+			break;
+		case MOUNT_ATTR_STRICTATIME:
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
+		if (attr->attr_set & MOUNT_ATTR__ATIME)
+			return -EINVAL;
+	}
+
+	return build_mount_idmapped(attr, usize, kattr, flags);
+}
+
+static void finish_mount_kattr(struct mount_kattr *kattr)
+{
+	put_user_ns(kattr->mnt_userns);
+	kattr->mnt_userns = NULL;
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+		unsigned int, flags, struct mount_attr __user *, uattr,
+		size_t, usize)
+{
+	int err;
+	struct path target;
+	struct mount_attr attr;
+	struct mount_kattr kattr;
+
+	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
+
+	if (flags & ~(AT_EMPTY_PATH |
+		      AT_RECURSIVE |
+		      AT_SYMLINK_NOFOLLOW |
+		      AT_NO_AUTOMOUNT))
+		return -EINVAL;
+
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
+		return -EINVAL;
+
+	if (!may_mount())
+		return -EPERM;
+
+	err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+	if (err)
+		return err;
+
+	/* Don't bother walking through the mounts if this is a nop. */
+	if (attr.attr_set == 0 &&
+	    attr.attr_clr == 0 &&
+	    attr.propagation == 0)
+		return 0;
+
+	err = build_mount_kattr(&attr, usize, &kattr, flags);
+	if (err)
+		return err;
+
+	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
+	if (err)
+		return err;
+
+	err = do_mount_setattr(&target, &kattr);
+	finish_mount_kattr(&kattr);
+	path_put(&target);
+	return err;
+}
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
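
The diff above wires up the new mount_setattr(2) syscall (the SYSCALL_DEFINE5 at the end). As a rough illustration of the userspace side, here is a minimal, hypothetical sketch, not part of this patch, that makes a mount tree read-only, nosuid and noatime in one call. The raw syscall number (442), the AT_RECURSIVE fallback define and the presence of struct mount_attr in <linux/mount.h> are assumptions about the installed headers.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mount.h>	/* struct mount_attr, MOUNT_ATTR_* (needs recent uapi headers) */

#ifndef __NR_mount_setattr
#define __NR_mount_setattr 442		/* assumed generic/x86_64 syscall number */
#endif
#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000		/* fallback for older headers: apply to the whole subtree */
#endif

static int sys_mount_setattr(int dfd, const char *path, unsigned int flags,
			     struct mount_attr *attr, size_t size)
{
	return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
}

int main(void)
{
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NOATIME,
		/*
		 * build_mount_kattr() requires MOUNT_ATTR__ATIME to be cleared as a
		 * whole when switching to a different atime mode.
		 */
		.attr_clr = MOUNT_ATTR__ATIME,
	};

	/* AT_RECURSIVE applies the change to every mount below /mnt as well. */
	if (sys_mount_setattr(AT_FDCWD, "/mnt", AT_RECURSIVE, &attr, sizeof(attr)) < 0) {
		perror("mount_setattr");	/* e.g. EBUSY if mnt_hold_writers() finds active writers */
		return 1;
	}
	return 0;
}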
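Similarly, a hedged sketch of the MOUNT_ATTR_IDMAP path: can_idmap_mount() only accepts mounts that still live in an anonymous mount namespace, so the expected flow is to detach a copy with open_tree(OPEN_TREE_CLONE), idmap it via mount_setattr() with an fd to a user namespace carrying the desired mapping, and only then attach it with move_mount(). The helper name and the fallback syscall numbers below are illustrative assumptions, not part of the patch.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>	/* OPEN_TREE_CLONE, MOVE_MOUNT_F_EMPTY_PATH, MOUNT_ATTR_IDMAP */

#ifndef __NR_open_tree
#define __NR_open_tree 428		/* assumed syscall numbers for older headers */
#endif
#ifndef __NR_move_mount
#define __NR_move_mount 429
#endif
#ifndef __NR_mount_setattr
#define __NR_mount_setattr 442
#endif

/* Hypothetical helper: bind-mount @source at @target with the ID mapping of @userns_fd. */
int idmapped_bind_mount(const char *source, const char *target, int userns_fd)
{
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_IDMAP,
		.userns_fd = userns_fd,	/* fd to a user namespace, validated by build_mount_idmapped() */
	};
	int fd_tree;

	/* Detached copy of the mount at @source; it lives in an anonymous mount ns. */
	fd_tree = syscall(__NR_open_tree, AT_FDCWD, source,
			  OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
	if (fd_tree < 0)
		return -1;

	/* Idmap the detached mount; only allowed before it becomes visible. */
	if (syscall(__NR_mount_setattr, fd_tree, "", AT_EMPTY_PATH,
		    &attr, sizeof(attr)) < 0)
		goto err;

	/* Attach the now idmapped mount at @target. */
	if (syscall(__NR_move_mount, fd_tree, "", AT_FDCWD, target,
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		goto err;

	close(fd_tree);
	return 0;
err:
	close(fd_tree);
	return -1;
}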
