diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/Kconfig | 4 | ||||
-rw-r--r-- | fs/Makefile | 1 | ||||
-rw-r--r-- | fs/backing-file.c | 336 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 12 | ||||
-rw-r--r-- | fs/cachefiles/io.c | 5 | ||||
-rw-r--r-- | fs/ceph/file.c | 13 | ||||
-rw-r--r-- | fs/coda/file.c | 2 | ||||
-rw-r--r-- | fs/fuse/file.c | 5 | ||||
-rw-r--r-- | fs/internal.h | 8 | ||||
-rw-r--r-- | fs/nfs/nfs4file.c | 5 | ||||
-rw-r--r-- | fs/nfsd/vfs.c | 7 | ||||
-rw-r--r-- | fs/open.c | 42 | ||||
-rw-r--r-- | fs/overlayfs/Kconfig | 1 | ||||
-rw-r--r-- | fs/overlayfs/copy_up.c | 30 | ||||
-rw-r--r-- | fs/overlayfs/file.c | 247 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 8 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 11 | ||||
-rw-r--r-- | fs/read_write.c | 235 | ||||
-rw-r--r-- | fs/readdir.c | 4 | ||||
-rw-r--r-- | fs/remap_range.c | 45 | ||||
-rw-r--r-- | fs/smb/client/cifsfs.c | 5 | ||||
-rw-r--r-- | fs/splice.c | 243 |
22 files changed, 759 insertions, 510 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 42837617a55b..231c7703793a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +# Stackable filesystems +config FS_STACK + bool + config BUFFER_HEAD bool diff --git a/fs/Makefile b/fs/Makefile index 75522f88e763..a6962c588962 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o +obj-$(CONFIG_FS_STACK) += backing-file.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ diff --git a/fs/backing-file.c b/fs/backing-file.c new file mode 100644 index 000000000000..a681f38d84d8 --- /dev/null +++ b/fs/backing-file.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Common helpers for stackable filesystems and backing files. + * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. + * Copyright (C) 2023 CTERA Networks. + */ + +#include <linux/fs.h> +#include <linux/backing-file.h> +#include <linux/splice.h> +#include <linux/mm.h> + +#include "internal.h" + +/** + * backing_file_open - open a backing file for kernel internal use + * @user_path: path that the user reuqested to open + * @flags: open flags + * @real_path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). + */ +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + path_get(user_path); + *backing_file_user_path(f) = *user_path; + error = vfs_open(real_path, f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1743904202b..41b479861b3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4533,29 +4533,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 009d23cd435b..5857241c5918 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - kiocb_end_write(iocb); + if (ki->was_async) + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - kiocb_start_write(&ki->iocb); - get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b5aae29e944..d380d9dad0e0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -12,6 +12,7 @@ #include <linux/falloc.h> #include <linux/iversion.h> #include <linux/ktime.h> +#include <linux/splice.h> #include "super.h" #include "mds_client.h" @@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * {read,write}_iter, which will get caps again. */ put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - ret = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, src_objlen, flags); + ret = splice_file_range(src_file, &src_off, dst_file, &dst_off, + src_objlen); /* Abort on short copies or on error */ if (ret < (long)src_objlen) { doutc(cl, "Failed partial copy (%zd)\n", ret); @@ -3065,8 +3066,8 @@ out_caps: */ if (len && (len < src_ci->i_layout.object_size)) { doutc(cl, "Final partial copy of %zu bytes\n", len); - bytes = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); + bytes = splice_file_range(src_file, &src_off, dst_file, + &dst_off, len); if (bytes > 0) ret += bytes; else @@ -3089,8 +3090,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/coda/file.c b/fs/coda/file.c index 16acc58311ea..148856a582a9 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -79,14 +79,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) if (ret) goto finish_write; - file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); inode_unlock(coda_inode); - file_end_write(host_file); finish_write: venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a660f1f21540..148a71b8b4d0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,7 @@ #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> +#include <linux/splice.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -3195,8 +3196,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } diff --git a/fs/internal.h b/fs/internal.h index 3a970a0644ce..bf2ee2e0d45d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -244,10 +244,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags, /* * fs/splice.c: */ -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags); +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags); /* * fs/xattr.c: diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 02788c3c85e5..e238abc78a13 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,7 @@ #include <linux/mount.h> #include <linux/nfs_fs.h> #include <linux/nfs_ssc.h> +#include <linux/splice.h> #include "delegation.h" #include "internal.h" #include "iostat.h" @@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(file_in, pos_in, file_out, - pos_out, count, flags); + ret = splice_copy_file_range(file_in, pos_in, file_out, + pos_out, count); return ret; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e01e4e2acbd9..707ef21f275b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1039,7 +1039,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); + host_err = rw_verify_area(READ, file, &offset, *count); + if (!host_err) + host_err = splice_direct_to_actor(file, &sd, + nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1176,9 +1179,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); - file_start_write(file); host_err = vfs_iter_write(file, &iter, &pos, flags); - file_end_write(file); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; diff --git a/fs/open.c b/fs/open.c index 954d8fcbb635..a84d21e55c39 100644 --- a/fs/open.c +++ b/fs/open.c @@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; + ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); + if (ret) + return ret; + if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -1178,44 +1182,6 @@ struct file *kernel_file_open(const struct path *path, int flags, } EXPORT_SYMBOL_GPL(kernel_file_open); -/** - * backing_file_open - open a backing file for kernel internal use - * @user_path: path that the user reuqested to open - * @flags: open flags - * @real_path: path of the backing file - * @cred: credentials for open - * - * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @user_path may be on the stackable filesystem and @real_path on the - * underlying filesystem. In this case, we want to be able to return the - * @user_path of the stackable filesystem. This is done by embedding the - * returned file into a container structure that also stores the stacked - * file's path, which can be retrieved using backing_file_user_path(). - */ -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred) -{ - struct file *f; - int error; - - f = alloc_empty_backing_file(flags, cred); - if (IS_ERR(f)) - return f; - - path_get(user_path); - *backing_file_user_path(f) = *user_path; - f->f_path = *real_path; - error = do_dentry_open(f, d_inode(real_path->dentry), NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } - - return f; -} -EXPORT_SYMBOL_GPL(backing_file_open); - #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index fec5020c3495..2ac67e04a6fb 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config OVERLAY_FS tristate "Overlay filesystem support" + select FS_STACK select EXPORTFS help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8bea66c97316..45cadc3aed85 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, return ovl_real_fileattr_set(new, &newfa); } +static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) +{ + loff_t tmp; + + if (WARN_ON_ONCE(pos != pos2)) + return -EIO; + if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0)) + return -EIO; + if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp))) + return -EIO; + return 0; +} + static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, struct file *new_file, loff_t len) { @@ -244,13 +257,20 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, int error = 0; ovl_path_lowerdata(dentry, &datapath); - if (WARN_ON(datapath.dentry == NULL)) + if (WARN_ON_ONCE(datapath.dentry == NULL) || + WARN_ON_ONCE(len < 0)) return -EIO; old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); + error = rw_verify_area(READ, old_file, &old_pos, len); + if (!error) + error = rw_verify_area(WRITE, new_file, &new_pos, len); + if (error) + goto out_fput; + /* Try to use clone_file_range to clone up within the same fs */ ovl_start_write(dentry); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); @@ -265,7 +285,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; - long bytes; + ssize_t bytes; if (len < this_len) this_len = len; @@ -309,11 +329,13 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } - ovl_start_write(dentry); + error = ovl_verify_area(old_pos, new_pos, this_len, len); + if (error) + break; + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); - ovl_end_write(dentry); if (bytes <= 0) { error = bytes; break; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 131621daeb13..05536964d37f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,25 +9,11 @@ #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> -#include <linux/splice.h> #include <linux/security.h> -#include <linux/mm.h> #include <linux/fs.h> +#include <linux/backing-file.h> #include "overlayfs.h" -#include "../internal.h" /* for sb_init_dio_done_wq */ - -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; - /* used for aio completion */ - struct work_struct work; - long res; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -274,83 +260,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -#define OVL_IOCB_MASK \ - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) - -static rwf_t iocb_to_rw_flags(int flags) -{ - return (__force rwf_t)(flags & OVL_IOCB_MASK); -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) { - kiocb_end_write(iocb); - ovl_file_modified(orig_iocb->ki_filp); - } - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - -static void ovl_aio_complete_work(struct work_struct *work) -{ - struct ovl_aio_req *aio_req = container_of(work, - struct ovl_aio_req, work); - - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); -} - -static void ovl_aio_queue_completion(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - /* - * Punt to a work queue to serialize updates of mtime/size. - */ - aio_req->res = res; - INIT_WORK(&aio_req->work, ovl_aio_complete_work); - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, - &aio_req->work); -} - -static int ovl_init_aio_done_wq(struct super_block *sb) -{ - if (sb->s_dio_done_wq) - return 0; - - return sb_init_dio_done_wq(sb); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -359,37 +278,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); - - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); -out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -400,9 +290,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_modified, + }; if (!iov_iter_count(iter)) return 0; @@ -410,19 +304,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -431,42 +317,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. */ ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(ifl); - - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - file_end_write(real.file); - /* Update size */ - ovl_file_modified(file); - } else { - struct ovl_aio_req *aio_req; - - ret = ovl_init_aio_done_wq(inode->i_sb); - if (ret) - goto out; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_queue_completion; - refcount_set(&aio_req->ref, 2); - kiocb_start_write(&aio_req->iocb); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -479,20 +330,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -508,30 +360,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_modified, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_file_modified(out); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: @@ -569,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; - - if (!realfile->f_op->mmap) - return -ENODEV; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) @@ -778,19 +613,3 @@ const struct file_operations ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 05c3dd597fa8..5ba11eb43767 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -425,6 +425,12 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -837,8 +843,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a0967bb25003..bcd4c314a7eb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1501,14 +1501,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1524,7 +1520,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/read_write.c b/fs/read_write.c index 4771701c896b..d4c036e82b6c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -354,6 +354,9 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { + int mask = read_write == READ ? MAY_READ : MAY_WRITE; + int ret; + if (unlikely((ssize_t) count < 0)) return -EINVAL; @@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } } - return security_file_permission(file, - read_write == READ ? MAY_READ : MAY_WRITE); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); @@ -773,12 +779,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return ret; } -static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->read_iter) + return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) @@ -787,22 +795,20 @@ static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, pos, tot_len); + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->read_iter) - ret = do_iter_readv_writev(file, iter, pos, READ, flags); - else - ret = do_loop_readv_writev(file, iter, pos, READ, flags); + ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_read); -ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -817,33 +823,30 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, tot_len = iov_iter_count(iter); if (!tot_len) goto out; - ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_read_iter(file, iocb, iter); + ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_read); - -ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->read_iter) - return -EINVAL; - return do_iter_read(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_read); -static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, rwf_t flags) +/* + * Caller is responsible for calling kiocb_end_write() on completion + * if async iocb was queued. + */ +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; + if (!file->f_op->write_iter) + return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) @@ -852,88 +855,127 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, pos, tot_len); + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; - if (file->f_op->write_iter) - ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); - else - ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); + kiocb_start_write(iocb); + ret = call_write_iter(file, iocb, iter); + if (ret != -EIOCBQUEUED) + kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); + return ret; } +EXPORT_SYMBOL(vfs_iocb_iter_write); -ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, - struct iov_iter *iter) +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, + rwf_t flags) { size_t tot_len; - ssize_t ret = 0; + ssize_t ret; - if (!file->f_op->write_iter) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; + if (!file->f_op->write_iter) + return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; - ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + + ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; - ret = call_write_iter(file, iocb, iter); + file_start_write(file); + ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); + file_end_write(file); return ret; } -EXPORT_SYMBOL(vfs_iocb_iter_write); - -ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - rwf_t flags) -{ - if (!file->f_op->write_iter) - return -EINVAL; - return do_iter_write(file, iter, ppos, flags); -} EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - ret = do_iter_read(file, &iter, pos, flags); - kfree(iov); - } + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, pos, tot_len); + if (ret < 0) + goto out; + + if (file->f_op->read_iter) + ret = do_iter_readv_writev(file, &iter, pos, READ, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, READ, flags); +out: + if (ret >= 0) + fsnotify_access(file); + kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, rwf_t flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t ret; + size_t tot_len; + ssize_t ret = 0; - ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); - if (ret >= 0) { - file_start_write(file); - ret = do_iter_write(file, &iter, pos, flags); - file_end_write(file); - kfree(iov); - } + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, + &iter); + if (ret < 0) + return ret; + + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; + + ret = rw_verify_area(WRITE, file, pos, tot_len); + if (ret < 0) + goto out; + + file_start_write(file); + if (file->f_op->write_iter) + ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); + else + ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); + if (ret > 0) + fsnotify_modify(file); + file_end_write(file); +out: + kfree(iov); return ret; } @@ -1178,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, - size_t count, loff_t max) + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; @@ -1250,10 +1292,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; - file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; @@ -1362,38 +1402,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, } #endif -/** - * generic_copy_file_range - copy data between two files - * @file_in: file structure to read from - * @pos_in: file offset to read from - * @file_out: file structure to write data to - * @pos_out: file offset to write data to - * @len: amount of data to copy - * @flags: copy flags - * - * This is a generic filesystem helper to copy data from one file to another. - * It has no constraints on the source or destination file owners - the files - * can belong to different superblocks and different filesystem types. Short - * copies are allowed. - * - * This should be called from the @file_out filesystem, as per the - * ->copy_file_range() method. - * - * Returns the number of bytes copied or a negative error indicating the - * failure. - */ - -ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); - - return do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); -} -EXPORT_SYMBOL(generic_copy_file_range); - /* * Performs necessary checks before doing a file copy * @@ -1478,6 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; + bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; @@ -1509,19 +1518,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - goto done; - } - - if (!splice && file_in->f_op->remap_file_range && - file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { + } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); - if (ret > 0) - goto done; + /* fallback to splice */ + if (ret <= 0) + splice = true; + } else if (samesb) { + /* Fallback to splice for same sb copy for backward compat */ + splice = true; } + file_end_write(file_out); + + if (!splice) + goto done; + /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in @@ -1533,11 +1547,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * - * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. + * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE + * for server-side-copy between any two sb. + * + * In any case, we call do_splice_direct() and not splice_file_range(), + * without file_start_write() held, to avoid possible deadlocks related + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. */ - ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, - flags); - + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); @@ -1549,8 +1568,6 @@ done: inc_syscr(current); inc_syscw(current); - file_end_write(file_out); - return ret; } EXPORT_SYMBOL(vfs_copy_file_range); diff --git a/fs/readdir.c b/fs/readdir.c index c8c46e294431..278bc0254732 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; + res = fsnotify_file_perm(file, MAY_READ); + if (res) + goto out; + res = down_read_killable(&inode->i_rwsem); if (res) goto out; diff --git a/fs/remap_range.c b/fs/remap_range.c index 87ae4f0dc3aa..f8c1120b8311 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; + int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; @@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); + ret = security_file_permission(file, mask); + if (ret) + return ret; + + return fsnotify_file_area_perm(file, mask, &pos, len); } /* @@ -385,14 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); if (ret < 0) @@ -410,6 +408,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, { loff_t ret; + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + file_start_write(file_out); ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); @@ -420,7 +426,7 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) +static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -445,24 +451,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) - goto out_drop_write; + return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) - goto out_drop_write; + return ret; + + /* + * This needs to be called after remap_verify_area() because of + * sb_start_write() and before may_dedupe_file() because the mount's + * MAY_WRITE need to be checked with mnt_get_write_access_file() held. + */ + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; ret = -EPERM; - if (!allow_file_dedupe(dst_file)) + if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2131638f26d0..99b0ade833aa 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> +#include <linux/splice.h> #include <linux/uuid.h> #include <linux/xattr.h> #include <uapi/linux/magic.h> @@ -1506,8 +1507,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off, free_xid(xid); if (rc == -EOPNOTSUPP || rc == -EXDEV) - rc = generic_copy_file_range(src_file, off, dst_file, - destoff, len, flags); + rc = splice_copy_file_range(src_file, off, dst_file, + destoff, len); return rc; } diff --git a/fs/splice.c b/fs/splice.c index d983d375ff11..218e24b1ac40 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; - int ret = 0, page_nr = 0; + ssize_t ret = 0; + int page_nr = 0; if (!spd_pages) return 0; @@ -673,10 +674,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; int nbufs = pipe->max_usage; - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); + struct bio_vec *array; ssize_t ret; + if (!out->f_op->write_iter) + return -EINVAL; + + array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; @@ -684,6 +688,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { + struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; @@ -733,7 +738,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); - ret = vfs_iter_write(out, &from, &sd.pos, 0); + init_sync_kiocb(&kiocb, out); + kiocb.ki_pos = sd.pos; + ret = call_write_iter(out, &kiocb, &from); + sd.pos = kiocb.ki_pos; if (ret <= 0) break; @@ -925,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op) /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); @@ -944,27 +952,15 @@ static void do_splice_eof(struct splice_desc *sd) sd->splice_eof(sd); } -/** - * vfs_splice_read - Read data from a file and splice it into a pipe - * @in: File to splice from - * @ppos: Input file offset - * @pipe: Pipe to splice to - * @len: Number of bytes to splice - * @flags: Splice modifier flags (SPLICE_F_*) - * - * Splice the requested amount of data from the input file to the pipe. This - * is synchronous as the caller must hold the pipe lock across the entire - * operation. - * - * If successful, it returns the amount of data spliced, 0 if it hit the EOF or - * a hole and a negative error code otherwise. +/* + * Callers already called rw_verify_area() on the entire range. + * No need to call it for sub ranges. */ -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static ssize_t do_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; - int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -975,10 +971,6 @@ long vfs_splice_read(struct file *in, loff_t *ppos, p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = rw_verify_area(READ, in, ppos, len); - if (unlikely(ret < 0)) - return ret; - if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; @@ -992,6 +984,34 @@ long vfs_splice_read(struct file *in, loff_t *ppos, return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } + +/** + * vfs_splice_read - Read data from a file and splice it into a pipe + * @in: File to splice from + * @ppos: Input file offset + * @pipe: Pipe to splice to + * @len: Number of bytes to splice + * @flags: Splice modifier flags (SPLICE_F_*) + * + * Splice the requested amount of data from the input file to the pipe. This + * is synchronous as the caller must hold the pipe lock across the entire + * operation. + * + * If successful, it returns the amount of data spliced, 0 if it hit the EOF or + * a hole and a negative error code otherwise. + */ +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + return do_splice_read(in, ppos, pipe, len, flags); +} EXPORT_SYMBOL_GPL(vfs_splice_read); /** @@ -1011,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; - long ret, bytes; + ssize_t ret, bytes; size_t len; int i, flags, more; @@ -1066,7 +1086,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = vfs_splice_read(in, &pos, pipe, len, flags); + ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; @@ -1138,9 +1158,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; + long ret; + + file_start_write(file); + ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); + file_end_write(file); + return ret; +} + +static int splice_file_range_actor(struct pipe_inode_info *pipe, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; - return do_splice_from(pipe, file, sd->opos, sd->total_len, - sd->flags); + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) @@ -1151,24 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd) file->f_op->splice_eof(file); } -/** - * do_splice_direct - splices data directly between two files - * @in: file to splice from - * @ppos: input file offset - * @out: file to splice to - * @opos: output file offset - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * For use by do_sendfile(). splice can easily emulate sendfile, but - * doing it in the application would incur an extra system call - * (splice in + splice out, as compared to just sendfile()). So this helper - * can splice directly through a process-private pipe. - * - */ -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags) +static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, + struct file *out, loff_t *opos, + size_t len, unsigned int flags, + splice_direct_actor *actor) { struct splice_desc sd = { .len = len, @@ -1179,7 +1196,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .splice_eof = direct_file_splice_eof, .opos = opos, }; - long ret; + ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1187,18 +1204,63 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; - ret = rw_verify_area(WRITE, out, opos, len); - if (unlikely(ret < 0)) - return ret; - - ret = splice_direct_to_actor(in, &sd, direct_splice_actor); + ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } +/** + * do_splice_direct - splices data directly between two files + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * For use by do_sendfile(). splice can easily emulate sendfile, but + * doing it in the application would incur an extra system call + * (splice in + splice out, as compared to just sendfile()). So this helper + * can splice directly through a process-private pipe. + * + * Callers already called rw_verify_area() on the entire range. + */ +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags) +{ + return do_splice_direct_actor(in, ppos, out, opos, len, flags, + direct_splice_actor); +} EXPORT_SYMBOL(do_splice_direct); +/** + * splice_file_range - splices data between two files for copy_file_range() + * @in: file to splice from + * @ppos: input file offset + * @out: file to splice to + * @opos: output file offset + * @len: number of bytes to splice + * + * Description: + * For use by ->copy_file_range() methods. + * Like do_splice_direct(), but vfs_copy_file_range() already holds + * start_file_write() on @out file. + * + * Callers already called rw_verify_area() on the entire range. + */ +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len) +{ + lockdep_assert(file_write_started(out)); + + return do_splice_direct_actor(in, ppos, out, opos, + min_t(size_t, len, MAX_RW_COUNT), + 0, splice_file_range_actor); +} +EXPORT_SYMBOL(splice_file_range); + static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { @@ -1220,17 +1282,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -long splice_file_to_pipe(struct file *in, - struct pipe_inode_info *opipe, - loff_t *offset, - size_t len, unsigned int flags) +ssize_t splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags) { - long ret; + ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = vfs_splice_read(in, offset, opipe, len, flags); + ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); @@ -1240,13 +1302,13 @@ long splice_file_to_pipe(struct file *in, /* * Determine where to splice to/from. */ -long do_splice(struct file *in, loff_t *off_in, struct file *out, - loff_t *off_out, size_t len, unsigned int flags) +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; - long ret; + ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -1307,6 +1369,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, offset = in->f_pos; } + ret = rw_verify_area(READ, in, &offset, len); + if (unlikely(ret < 0)) + return ret; + if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; @@ -1333,14 +1399,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, return ret; } -static long __do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +static ssize_t __do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; - long ret; + ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); @@ -1379,16 +1445,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in, return ret; } -static int iter_to_pipe(struct iov_iter *from, - struct pipe_inode_info *pipe, - unsigned flags) +static ssize_t iter_to_pipe(struct iov_iter *from, + struct pipe_inode_info *pipe, + unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; - int ret = 0; + ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; @@ -1437,8 +1503,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { @@ -1446,7 +1512,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, .flags = flags, .u.data = iter }; - long ret = 0; + ssize_t ret = 0; if (!pipe) return -EBADF; @@ -1470,11 +1536,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ -static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, - unsigned int flags) +static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, + unsigned int flags) { struct pipe_inode_info *pipe; - long ret = 0; + ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) @@ -1570,7 +1636,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, size_t, len, unsigned int, flags) { struct fd in, out; - long error; + ssize_t error; if (unlikely(!len)) return 0; @@ -1584,7 +1650,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, out = fdget(fd_out); if (out.file) { error = __do_splice(in.file, off_in, out.file, off_out, - len, flags); + len, flags); fdput(out); } fdput(in); @@ -1807,15 +1873,15 @@ retry: /* * Link contents of ipipe to opipe. */ -static int link_pipe(struct pipe_inode_info *ipipe, - struct pipe_inode_info *opipe, - size_t len, unsigned int flags) +static ssize_t link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; - int ret = 0; + ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1898,11 +1964,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ -long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); - int ret = -EINVAL; + ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) @@ -1939,7 +2006,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in, out; - int error; + ssize_t error; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; |