diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-21 18:19:09 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-21 18:19:09 -0700 |
commit | d9a185f8b49678775ef56ecbdbc7b76970302897 (patch) | |
tree | 7ace1b26133e5d796af09e5d71d6531bcb69865c /fs | |
parent | c22fc16d172fba4d19ffd8f2aa8fe67edba63895 (diff) | |
parent | 989974c804574d250ac92d44e220081959ac8ac1 (diff) |
Merge tag 'ovl-update-4.19' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs
Pull overlayfs updates from Miklos Szeredi:
"This contains two new features:
- Stack file operations: this allows removal of several hacks from
the VFS, proper interaction of read-only open files with copy-up,
possibility to implement fs modifying ioctls properly, and others.
- Metadata only copy-up: when file is on lower layer and only
metadata is modified (except size) then only copy up the metadata
and continue to use the data from the lower file"
* tag 'ovl-update-4.19' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs: (66 commits)
ovl: Enable metadata only feature
ovl: Do not do metacopy only for ioctl modifying file attr
ovl: Do not do metadata only copy-up for truncate operation
ovl: add helper to force data copy-up
ovl: Check redirect on index as well
ovl: Set redirect on upper inode when it is linked
ovl: Set redirect on metacopy files upon rename
ovl: Do not set dentry type ORIGIN for broken hardlinks
ovl: Add an inode flag OVL_CONST_INO
ovl: Treat metacopy dentries as type OVL_PATH_MERGE
ovl: Check redirects for metacopy files
ovl: Move some dir related ovl_lookup_single() code in else block
ovl: Do not expose metacopy only dentry from d_real()
ovl: Open file with data except for the case of fsync
ovl: Add helper ovl_inode_realdata()
ovl: Store lower data inode in ovl_inode
ovl: Fix ovl_getattr() to get number of blocks from lower
ovl: Add helper ovl_dentry_lowerdata() to get lower data dentry
ovl: Copy up meta inode data from lowest data inode
ovl: Modify ovl_lookup() and friends to lookup metacopy dentry
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.h | 5 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 11 | ||||
-rw-r--r-- | fs/file_table.c | 69 | ||||
-rw-r--r-- | fs/inode.c | 46 | ||||
-rw-r--r-- | fs/internal.h | 11 | ||||
-rw-r--r-- | fs/ioctl.c | 1 | ||||
-rw-r--r-- | fs/locks.c | 20 | ||||
-rw-r--r-- | fs/namei.c | 2 | ||||
-rw-r--r-- | fs/namespace.c | 69 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 17 | ||||
-rw-r--r-- | fs/open.c | 44 | ||||
-rw-r--r-- | fs/overlayfs/Kconfig | 19 | ||||
-rw-r--r-- | fs/overlayfs/Makefile | 4 | ||||
-rw-r--r-- | fs/overlayfs/copy_up.c | 190 | ||||
-rw-r--r-- | fs/overlayfs/dir.c | 105 | ||||
-rw-r--r-- | fs/overlayfs/export.c | 3 | ||||
-rw-r--r-- | fs/overlayfs/file.c | 511 | ||||
-rw-r--r-- | fs/overlayfs/inode.c | 175 | ||||
-rw-r--r-- | fs/overlayfs/namei.c | 195 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 47 | ||||
-rw-r--r-- | fs/overlayfs/ovl_entry.h | 6 | ||||
-rw-r--r-- | fs/overlayfs/readdir.c | 19 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 103 | ||||
-rw-r--r-- | fs/overlayfs/util.c | 252 | ||||
-rw-r--r-- | fs/read_write.c | 94 | ||||
-rw-r--r-- | fs/xattr.c | 9 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 29 |
27 files changed, 1531 insertions, 525 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 318be7864072..53af9f5253f4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3217,8 +3217,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, - struct file *dst_file, u64 dst_loff); +int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, + struct file *dst_file, loff_t dst_loff, + u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d3a5d2a41e5f..63600dc2ac4c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3592,13 +3592,13 @@ out_unlock: return ret; } -ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, - struct file *dst_file, u64 dst_loff) +int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, + struct file *dst_file, loff_t dst_loff, + u64 olen) { struct inode *src = file_inode(src_file); struct inode *dst = file_inode(dst_file); u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - ssize_t res; if (WARN_ON_ONCE(bs < PAGE_SIZE)) { /* @@ -3609,10 +3609,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, return -EINVAL; } - res = btrfs_extent_same(src, loff, olen, dst, dst_loff); - if (res) - return res; - return olen; + return btrfs_extent_same(src, src_loff, olen, dst, dst_loff); } static int clone_finish_inode_update(struct btrfs_trans_handle *trans, diff --git a/fs/file_table.c b/fs/file_table.c index d6eccd04d703..e49af4caf15d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,8 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { security_file_free(f); - percpu_counter_dec(&nr_files); + if (!(f->f_mode & FMODE_NOACCOUNT)) + percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -91,6 +92,34 @@ int proc_nr_files(struct ctl_table *table, int write, } #endif +static struct file *__alloc_file(int flags, const struct cred *cred) +{ + struct file *f; + int error; + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + f->f_cred = get_cred(cred); + error = security_file_alloc(f); + if (unlikely(error)) { + file_free_rcu(&f->f_u.fu_rcuhead); + return ERR_PTR(error); + } + + atomic_long_set(&f->f_count, 1); + rwlock_init(&f->f_owner.lock); + spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); + eventpoll_init_file(f); + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + /* f->f_version: 0 */ + + return f; +} + /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. @@ -105,7 +134,6 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; - int error; /* * Privileged users can go above max_files @@ -119,26 +147,10 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - - f->f_cred = get_cred(cred); - error = security_file_alloc(f); - if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); - return ERR_PTR(error); - } + f = __alloc_file(flags, cred); + if (!IS_ERR(f)) + percpu_counter_inc(&nr_files); - atomic_long_set(&f->f_count, 1); - rwlock_init(&f->f_owner.lock); - spin_lock_init(&f->f_lock); - mutex_init(&f->f_pos_lock); - eventpoll_init_file(f); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ - percpu_counter_inc(&nr_files); return f; over: @@ -150,6 +162,21 @@ over: return ERR_PTR(-ENFILE); } +/* + * Variant of alloc_empty_file() that doesn't check and modify nr_files. + * + * Should not be used unless there's a very good reason to do so. + */ +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) +{ + struct file *f = __alloc_file(flags, cred); + + if (!IS_ERR(f)) + f->f_mode |= FMODE_NOACCOUNT; + + return f; +} + /** * alloc_file - allocate and initialize a 'struct file' * diff --git a/fs/inode.c b/fs/inode.c index a06de4454232..42f6d25f32a5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1596,49 +1596,16 @@ sector_t bmap(struct inode *inode, sector_t block) EXPORT_SYMBOL(bmap); /* - * Update times in overlayed inode from underlying real inode - */ -static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, - bool rcu) -{ - struct dentry *upperdentry; - - /* - * Nothing to do if in rcu or if non-overlayfs - */ - if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL))) - return; - - upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - - /* - * If file is on lower then we can't update atime, so no worries about - * stale mtime/ctime. - */ - if (upperdentry) { - struct inode *realinode = d_inode(upperdentry); - - if ((!timespec64_equal(&inode->i_mtime, &realinode->i_mtime) || - !timespec64_equal(&inode->i_ctime, &realinode->i_ctime))) { - inode->i_mtime = realinode->i_mtime; - inode->i_ctime = realinode->i_ctime; - } - } -} - -/* * With relative atime, only update atime if the previous atime is * earlier than either the ctime or mtime or if at least a day has * passed since the last atime update. */ -static int relatime_need_update(const struct path *path, struct inode *inode, - struct timespec now, bool rcu) +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec now) { - if (!(path->mnt->mnt_flags & MNT_RELATIME)) + if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; - - update_ovl_inode_times(path->dentry, inode, rcu); /* * Is mtime younger than atime? If yes, update atime: */ @@ -1709,8 +1676,7 @@ static int update_time(struct inode *inode, struct timespec64 *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -bool __atime_needs_update(const struct path *path, struct inode *inode, - bool rcu) +bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now; @@ -1736,7 +1702,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, now = current_time(inode); - if (!relatime_need_update(path, inode, timespec64_to_timespec(now), rcu)) + if (!relatime_need_update(mnt, inode, timespec64_to_timespec(now))) return false; if (timespec64_equal(&inode->i_atime, &now)) @@ -1751,7 +1717,7 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec64 now; - if (!__atime_needs_update(path, inode, false)) + if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) diff --git a/fs/internal.h b/fs/internal.h index 50a28fc71300..d410186bc369 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -82,10 +82,8 @@ extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); -extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); -extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c @@ -96,6 +94,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *); * file_table.c */ extern struct file *alloc_empty_file(int, const struct cred *); +extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c @@ -136,13 +135,6 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); -extern bool __atime_needs_update(const struct path *, struct inode *, bool); -static inline bool atime_needs_update_rcu(const struct path *path, - struct inode *inode) -{ - return __atime_needs_update(path, inode, true); -} - /* * fs-writeback.c */ @@ -185,7 +177,6 @@ extern const struct dentry_operations ns_dentry_operations; */ extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * iomap support: diff --git a/fs/ioctl.c b/fs/ioctl.c index b445b13fc59b..3212c29235ce 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -49,6 +49,7 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } +EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { diff --git a/fs/locks.c b/fs/locks.c index 5086bde5a18e..2ecb4db8c840 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -139,11 +139,6 @@ #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) #define IS_REMOTELCK(fl) (fl->fl_pid <= 0) -static inline bool is_remote_lock(struct file *filp) -{ - return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK)); -} - static bool lease_breaking(struct file_lock *fl) { return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING); @@ -1651,8 +1646,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && - (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || @@ -1873,7 +1867,7 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - if (filp->f_op->setlease && is_remote_lock(filp)) + if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else return generic_setlease(filp, arg, lease, priv); @@ -2020,7 +2014,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (error) goto out_free; - if (f.file->f_op->flock && is_remote_lock(f.file)) + if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, lock); @@ -2046,7 +2040,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ int vfs_test_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock && is_remote_lock(filp)) + if (filp->f_op->lock) return filp->f_op->lock(filp, F_GETLK, fl); posix_test_lock(filp, fl); return 0; @@ -2196,7 +2190,7 @@ out: */ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { - if (filp->f_op->lock && is_remote_lock(filp)) + if (filp->f_op->lock) return filp->f_op->lock(filp, cmd, fl); else return posix_lock_file(filp, fl, conf); @@ -2518,7 +2512,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - if (filp->f_op->flock && is_remote_lock(filp)) + if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else flock_lock_inode(inode, &fl); @@ -2605,7 +2599,7 @@ EXPORT_SYMBOL(posix_unblock_lock); */ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { - if (filp->f_op->lock && is_remote_lock(filp)) + if (filp->f_op->lock) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } diff --git a/fs/namei.c b/fs/namei.c index 3cd396277cd3..ae6aa9ae757c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1015,7 +1015,7 @@ const char *get_link(struct nameidata *nd) if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update_rcu(&last->link, inode)) { + } else if (atime_needs_update(&last->link, inode)) { if (unlikely(unlazy_walk(nd))) return ERR_PTR(-ECHILD); touch_atime(&last->link); diff --git a/fs/namespace.c b/fs/namespace.c index bd2f4c68506a..725d6935fab9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -431,74 +431,20 @@ int __mnt_want_write_file(struct file *file) } /** - * mnt_want_write_file_path - get write access to a file's mount - * @file: the file who's mount on which to take a write - * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already - * - * Called by the vfs for cases when we have an open file at hand, but will do an - * inode operation on it (important distinction for files opened on overlayfs, - * since the file operations will come from the real underlying file, while - * inode operations come from the overlay). - */ -int mnt_want_write_file_path(struct file *file) -{ - int ret; - - sb_start_write(file->f_path.mnt->mnt_sb); - ret = __mnt_want_write_file(file); - if (ret) - sb_end_write(file->f_path.mnt->mnt_sb); - return ret; -} - -static inline int may_write_real(struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct dentry *upperdentry; - - /* Writable file? */ - if (file->f_mode & FMODE_WRITER) - return 0; - - /* Not overlayfs? */ - if (likely(!(dentry->d_flags & DCACHE_OP_REAL))) - return 0; - - /* File refers to upper, writable layer? */ - upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - if (upperdentry && - (file_inode(file) == d_inode(upperdentry) || - file_inode(file) == d_inode(dentry))) - return 0; - - /* Lower layer: can't write to real file, sorry... */ - return -EPERM; -} - -/** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already - * - * Mostly called by filesystems from their ioctl operation before performing - * modification. On overlayfs this needs to check if the file is on a read-only - * lower layer and deny access in that case. */ int mnt_want_write_file(struct file *file) { int ret; - ret = may_write_real(file); - if (!ret) { - sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); - if (ret) - sb_end_write(file_inode(file)->i_sb); - } + sb_start_write(file_inode(file)->i_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); @@ -538,14 +484,9 @@ void __mnt_drop_write_file(struct file *file) __mnt_drop_write(file->f_path.mnt); } -void mnt_drop_write_file_path(struct file *file) -{ - mnt_drop_write(file->f_path.mnt); -} - void mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); + __mnt_drop_write_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 255f758af03a..9fa35cb6f6e0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2537,19 +2537,14 @@ static int ocfs2_file_clone_range(struct file *file_in, len, false); } -static ssize_t ocfs2_file_dedupe_range(struct file *src_file, - u64 loff, - u64 len, - struct file *dst_file, - u64 dst_loff) +static int ocfs2_file_dedupe_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) { - int error; - - error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, true); - if (error) - return error; - return len; } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/open.c b/fs/open.c index d98e19239bb7..0285ce7dbd51 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,7 +68,6 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; - struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -91,17 +90,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - /* - * If this is an overlayfs then do as if opening the file so we get - * write access on the upper inode, not on the overlay inode. For - * non-overlay filesystems d_real() is an identity function. - */ - upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); - error = PTR_ERR(upperdentry); - if (IS_ERR(upperdentry)) - goto mnt_drop_write_and_out; - - error = get_write_access(upperdentry->d_inode); + error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; @@ -120,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(upperdentry->d_inode); + put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: @@ -707,12 +696,12 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group) if (!f.file) goto out; - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (error) goto out_fput; audit_file(f.file); error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); out_fput: fdput(f); out: @@ -887,13 +876,8 @@ EXPORT_SYMBOL(file_path); */ int vfs_open(const struct path *path, struct file *file) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); - - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - file->f_path = *path; - return do_dentry_open(file, d_backing_inode(dentry), NULL); + return do_dentry_open(file, d_backing_inode(path->dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, @@ -919,6 +903,24 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *open_with_fake_path(const struct path *path, int flags, + struct inode *inode, const struct cred *cred) +{ + struct file *f = alloc_empty_file_noaccount(flags, cred); + if (!IS_ERR(f)) { + int error; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} +EXPORT_SYMBOL(open_with_fake_path); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 9384164253ac..2ef91be2a04e 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -64,6 +64,7 @@ config OVERLAY_FS_NFS_EXPORT bool "Overlayfs: turn on NFS export feature by default" depends on OVERLAY_FS depends on OVERLAY_FS_INDEX + depends on !OVERLAY_FS_METACOPY help If this config option is enabled then overlay filesystems will use the index directory to decode overlay NFS file handles by default. @@ -103,3 +104,21 @@ config OVERLAY_FS_XINO_AUTO For more information, see Documentation/filesystems/overlayfs.txt If unsure, say N. + +config OVERLAY_FS_METACOPY + bool "Overlayfs: turn on metadata only copy up feature by default" + depends on OVERLAY_FS + select OVERLAY_FS_REDIRECT_DIR + help + If this config option is enabled then overlay filesystems will + copy up only metadata where appropriate and data copy up will + happen when a file is opened for WRITE operation. It is still + possible to turn off this feature globally with the "metacopy=off" + module option or on a filesystem instance basis with the + "metacopy=off" mount option. + + Note, that this feature is not backward compatible. That is, + mounting an overlay which has metacopy only inodes on a kernel + that doesn't support this feature will have unexpected results. + + If unsure, say N. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 30802347a020..46e1ff8ac056 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o \ - export.o +overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ + copy_up.o export.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ddaddb4ce4c3..296037afecdb 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -25,35 +25,20 @@ #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) -static bool __read_mostly ovl_check_copy_up; -module_param_named(check_copy_up, ovl_check_copy_up, bool, - S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(ovl_check_copy_up, - "Warn on copy-up when causing process also has a R/O fd open"); - -static int ovl_check_fd(const void *data, struct file *f, unsigned int fd) +static int ovl_ccup_set(const char *buf, const struct kernel_param *param) { - const struct dentry *dentry = data; - - if (file_inode(f) == d_inode(dentry)) - pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n", - f, fd, current->pid, current->comm); + pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n"); return 0; } -/* - * Check the fds open by this process and warn if something like the following - * scenario is about to occur: - * - * fd1 = open("foo", O_RDONLY); - * fd2 = open("foo", O_RDWR); - */ -static void ovl_do_check_copy_up(struct dentry *dentry) +static int ovl_ccup_get(char *buf, const struct kernel_param *param) { - if (ovl_check_copy_up) - iterate_fd(current->files, 0, ovl_check_fd, dentry); + return sprintf(buf, "N\n"); } +module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644); +MODULE_PARM_DESC(ovl_check_copy_up, "Obsolete; does nothing"); + int ovl_copy_xattr(struct dentry *old, struct dentry *new) { ssize_t list_size, size, value_size = 0; @@ -195,6 +180,16 @@ out_fput: return error; } +static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = ATTR_SIZE, + .ia_size = stat->size, + }; + + return notify_change(upperdentry, &attr, NULL); +} + static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { @@ -403,6 +398,7 @@ struct ovl_copy_up_ctx { bool tmpfile; bool origin; bool indexed; + bool metacopy; }; static int ovl_link_up(struct ovl_copy_up_ctx *c) @@ -505,28 +501,10 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; - if (S_ISREG(c->stat.mode)) { - struct path upperpath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - err = ovl_copy_up_data(&c->lowerpath, &upperpath, c->stat.size); - if (err) - return err; - } - err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; - inode_lock(temp->d_inode); - err = ovl_set_attr(temp, &c->stat); - inode_unlock(temp->d_inode); - if (err) - return err; - /* * Store identifier of lower inode in upper inode xattr to * allow lookup of the copy up origin inode. @@ -540,7 +518,34 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - return 0; + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + + if (c->metacopy) { + err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, + NULL, 0, -EOPNOTSUPP); + if (err) + return err; + } + + inode_lock(temp->d_inode); + if (c->metacopy) + err = ovl_set_size(temp, &c->stat); + if (!err) + err = ovl_set_attr(temp, &c->stat); + inode_unlock(temp->d_inode); + + return err; } static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) @@ -575,6 +580,8 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) if (err) goto out; + if (!c->metacopy) + ovl_set_upperdata(d_inode(c->dentry)); inode = d_inode(c->dentry); ovl_inode_update(inode, newdentry); if (S_ISDIR(inode->i_mode)) @@ -677,6 +684,49 @@ out: return err; } +static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, + int flags) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + + if (!ofs->config.metacopy) + return false; + + if (!S_ISREG(mode)) + return false; + + if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC))) + return false; + + return true; +} + +/* Copy up data of an inode which was copied up metadata only in the past. */ +static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) +{ + struct path upperpath, datapath; + int err; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry == NULL)) + return -EIO; + + ovl_path_lowerdata(c->dentry, &datapath); + if (WARN_ON(datapath.dentry == NULL)) + return -EIO; + + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); + if (err) + return err; + + ovl_set_upperdata(d_inode(c->dentry)); + return err; +} + static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, int flags) { @@ -698,6 +748,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (err) return err; + ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags); + if (parent) { ovl_path_upper(parent, &parentpath); ctx.destdir = parentpath.dentry; @@ -719,9 +771,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, if (IS_ERR(ctx.link)) return PTR_ERR(ctx.link); } - ovl_do_check_copy_up(ctx.lowerpath.dentry); - err = ovl_copy_up_start(dentry); + err = ovl_copy_up_start(dentry, flags); /* err < 0: interrupted, err > 0: raced with another copy-up */ if (unlikely(err)) { if (err > 0) @@ -731,6 +782,8 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, err = ovl_do_copy_up(&ctx); if (!err && parent && !ovl_dentry_has_upper_alias(dentry)) err = ovl_link_up(&ctx); + if (!err && ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + err = ovl_copy_up_meta_inode_data(&ctx); ovl_copy_up_end(dentry); } do_delayed_call(&done); @@ -756,21 +809,7 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags) struct dentry *next; struct dentry *parent = NULL; - /* - * Check if copy-up has happened as well as for upper alias (in - * case of hard links) is there. - * - * Both checks are lockless: - * - false negatives: will recheck under oi->lock - * - false positives: - * + ovl_dentry_upper() uses memory barriers to ensure the - * upper dentry is up-to-date - * + ovl_dentry_has_upper_alias() relies on locking of - * upper parent i_rwsem to prevent reordering copy-up - * with rename. - */ - if (ovl_dentry_upper(dentry) && - (ovl_dentry_has_upper_alias(dentry) || disconnected)) + if (ovl_already_copied_up(dentry, flags)) break; next = dget(dentry); @@ -795,6 +834,41 @@ int ovl_copy_up_flags(struct dentry *dentry, int flags) return err; } +static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) +{ + /* Copy up of disconnected dentry does not set upper alias */ + if (ovl_already_copied_up(dentry, flags)) + return false; + + if (special_file(d_inode(dentry)->i_mode)) + return false; + + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return true; +} + +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) +{ + int err = 0; + + if (ovl_open_need_copy_up(dentry, file_flags)) { + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up_flags(dentry, file_flags); + ovl_drop_write(dentry); + } + } + + return err; +} + +int ovl_copy_up_with_data(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, O_WRONLY); +} + int ovl_copy_up(struct dentry *dentry) { return ovl_copy_up_flags(dentry, 0); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f480b1a2cd2e..ec350d4d921c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -24,6 +24,8 @@ module_param_named(redirect_max, ovl_redirect_max, ushort, 0644); MODULE_PARM_DESC(ovl_redirect_max, "Maximum length of absolute redirect xattr value"); +static int ovl_set_redirect(struct dentry *dentry, bool samedir); + int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) { int err; @@ -242,7 +244,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, .newinode = inode, }; - ovl_dentry_version_inc(dentry->d_parent, false); + ovl_dir_modified(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { /* @@ -657,6 +659,12 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) goto out_drop_write; + if (ovl_is_metacopy_dentry(old)) { + err = ovl_set_redirect(old, false); + if (err) + goto out_drop_write; + } + err = ovl_nlink_start(old, &locked); if (err) goto out_drop_write; @@ -722,7 +730,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (err) goto out_d_drop; - ovl_dentry_version_inc(dentry->d_parent, true); + ovl_dir_modified(dentry->d_parent, true); out_d_drop: d_drop(dentry); out_dput_upper: @@ -767,7 +775,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, err = vfs_rmdir(dir, upper); else err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent, ovl_type_origin(dentry)); + ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); /* * Keeping this dentry hashed would mean having to release @@ -797,6 +805,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) int err; bool locked = false; const struct cred *old_cred; + struct dentry *upperdentry; bool lower_positive = ovl_lower_positive(dentry); LIST_HEAD(list); @@ -832,6 +841,17 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) drop_nlink(dentry->d_inode); } ovl_nlink_end(dentry, locked); + + /* + * Copy ctime + * + * Note: we fail to update ctime if there was no copy-up, only a + * whiteout + */ + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) + ovl_copyattr(d_inode(upperdentry), d_inode(dentry)); + out_drop_write: ovl_drop_write(dentry); out: @@ -862,13 +882,13 @@ static bool ovl_can_move(struct dentry *dentry) !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); } -static char *ovl_get_redirect(struct dentry *dentry, bool samedir) +static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect) { char *buf, *ret; struct dentry *d, *tmp; int buflen = ovl_redirect_max + 1; - if (samedir) { + if (!abs_redirect) { ret = kstrndup(dentry->d_name.name, dentry->d_name.len, GFP_KERNEL); goto out; @@ -922,15 +942,43 @@ out: return ret ? ret : ERR_PTR(-ENOMEM); } +static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir) +{ + struct dentry *lowerdentry; + + if (!samedir) + return true; + + if (d_is_dir(dentry)) + return false; + + /* + * For non-dir hardlinked files, we need absolute redirects + * in general as two upper hardlinks could be in different + * dirs. We could put a relative redirect now and convert + * it to absolute redirect later. But when nlink > 1 and + * indexing is on, that means relative redirect needs to be + * converted to absolute during copy up of another lower + * hardllink as well. + * + * So without optimizing too much, just check if lower is + * a hard link or not. If lower is hard link, put absolute + * redirect. + */ + lowerdentry = ovl_dentry_lower(dentry); + return (d_inode(lowerdentry)->i_nlink > 1); +} + static int ovl_set_redirect(struct dentry *dentry, bool samedir) { int err; const char *redirect = ovl_dentry_get_redirect(dentry); + bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir); - if (redirect && (samedir || redirect[0] == '/')) + if (redirect && (!absolute_redirect || redirect[0] == '/')) return 0; - redirect = ovl_get_redirect(dentry, samedir); + redirect = ovl_get_redirect(dentry, absolute_redirect); if (IS_ERR(redirect)) return PTR_ERR(redirect); @@ -1106,22 +1154,20 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, goto out_dput; err = 0; - if (is_dir) { - if (ovl_type_merge_or_lower(old)) - err = ovl_set_redirect(old, samedir); - else if (!old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); - if (err) - goto out_dput; - } - if (!overwrite && new_is_dir) { - if (ovl_type_merge_or_lower(new)) - err = ovl_set_redirect(new, samedir); - else if (!new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); - if (err) - goto out_dput; - } + if (ovl_type_merge_or_lower(old)) + err = ovl_set_redirect(old, samedir); + else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) + err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + if (err) + goto out_dput; + + if (!overwrite && ovl_type_merge_or_lower(new)) + err = ovl_set_redirect(new, samedir); + else if (!overwrite && new_is_dir && !new_opaque && + ovl_type_merge(old->d_parent)) + err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + if (err) + goto out_dput; err = ovl_do_rename(old_upperdir->d_inode, olddentry, new_upperdir->d_inode, newdentry, flags); @@ -1138,10 +1184,15 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) || - (!overwrite && ovl_type_origin(new))); - ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) || - (d_inode(new) && ovl_type_origin(new))); + ovl_dir_modified(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dir_modified(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); + + /* copy ctime: */ + ovl_copyattr(d_inode(olddentry), d_inode(old)); + if (d_inode(new) && ovl_dentry_upper(new)) + ovl_copyattr(d_inode(newdentry), d_inode(new)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 9941ece61a14..8fa37cd7818a 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -317,6 +317,9 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, return ERR_CAST(inode); } + if (upper) + ovl_set_flag(OVL_UPPERDATA, inode); + dentry = d_find_any_alias(inode); if (!dentry) { dentry = d_alloc_anon(inode->i_sb); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c new file mode 100644 index 000000000000..32e9282893c9 --- /dev/null +++ b/fs/overlayfs/file.c @@ -0,0 +1,511 @@ +/* + * Copyright (C) 2017 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/xattr.h> +#include <linux/uio.h> +#include "overlayfs.h" + +static char ovl_whatisit(struct inode *inode, struct inode *realinode) +{ + if (realinode != ovl_inode_upper(inode)) + return 'l'; + if (ovl_has_upperdata(inode)) + return 'u'; + else + return 'm'; +} + +static struct file *ovl_open_realfile(const struct file *file, + struct inode *realinode) +{ + struct inode *inode = file_inode(file); + struct file *realfile; + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + realfile = open_with_fake_path(&file->f_path, file->f_flags | O_NOATIME, + realinode, current_cred()); + revert_creds(old_cred); + + pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", + file, file, ovl_whatisit(inode, realinode), file->f_flags, + realfile, IS_ERR(realfile) ? 0 : realfile->f_flags); + + return realfile; +} + +#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT) + +static int ovl_change_flags(struct file *file, unsigned int flags) +{ + struct inode *inode = file_inode(file); + int err; + + /* No atime modificaton on underlying */ + flags |= O_NOATIME; + + /* If some flag changed that cannot be changed then something's amiss */ + if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) + return -EIO; + + flags &= OVL_SETFL_MASK; + + if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) + return -EPERM; + + if (flags & O_DIRECT) { + if (!file->f_mapping->a_ops || + !file->f_mapping->a_ops->direct_IO) + return -EINVAL; + } + + if (file->f_op->check_flags) { + err = file->f_op->check_flags(flags); + if (err) + return err; + } + + spin_lock(&file->f_lock); + file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; + spin_unlock(&file->f_lock); + + return 0; +} + +static int ovl_real_fdget_meta(const struct file *file, struct fd *real, + bool allow_meta) +{ + struct inode *inode = file_inode(file); + struct inode *realinode; + + real->flags = 0; + real->file = file->private_data; + + if (allow_meta) + realinode = ovl_inode_real(inode); + else + realinode = ovl_inode_realdata(inode); + + /* Has it been copied up since we'd opened it? */ + if (unlikely(file_inode(real->file) != realinode)) { + real->flags = FDPUT_FPUT; + real->file = ovl_open_realfile(file, realinode); + + return PTR_ERR_OR_ZERO(real->file); + } + + /* Did the flags change since open? */ + if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME)) + return ovl_change_flags(real->file, file->f_flags); + + return 0; +} + +static int ovl_real_fdget(const struct file *file, struct fd *real) +{ + return ovl_real_fdget_meta(file, real, false); +} + +static int ovl_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file_dentry(file); + struct file *realfile; + int err; + + err = ovl_open_maybe_copy_up(dentry, file->f_flags); + if (err) + return err; + + /* No longer need these flags, so don't pass them on to underlying fs */ + file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + realfile = ovl_open_realfile(file, ovl_inode_realdata(inode)); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */ + file->f_mapping = realfile->f_mapping; + + file->private_data = realfile; + + return 0; +} + +static int ovl_release(struct inode *inode, struct file *file) +{ + fput(file->private_data); + + return 0; +} + +static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *realinode = ovl_inode_real(file_inode(file)); + + return generic_file_llseek_size(file, offset, whence, + realinode->i_sb->s_maxbytes, + i_size_read(realinode)); +} + +static void ovl_file_accessed(struct file *file) +{ + struct inode *inode, *upperinode; + + if (file->f_flags & O_NOATIME) + return; + + inode = file_inode(file); + upperinode = ovl_inode_upper(inode); + + if (!upperinode) + return; + + if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || + !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { + inode->i_mtime = upperinode->i_mtime; + inode->i_ctime = upperinode->i_ctime; + } + + touch_atime(&file->f_path); +} + +static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) +{ + int ifl = iocb->ki_flags; + rwf_t flags = 0; + + if (ifl & IOCB_NOWAIT) + flags |= RWF_NOWAIT; + if (ifl & IOCB_HIPRI) + flags |= RWF_HIPRI; + if (ifl & IOCB_DSYNC) + flags |= RWF_DSYNC; + if (ifl & IOCB_SYNC) + flags |= RWF_SYNC; + + return flags; +} + +static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + if (!iov_iter_count(iter)) + return 0; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + revert_creds(old_cred); + + ovl_file_accessed(file); + + fdput(real); + + return ret; +} + +static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + if (!iov_iter_count(iter)) + return 0; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(ovl_inode_real(inode), inode); + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(file, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + +static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget_meta(file, &real, !datasync); + if (ret) + return ret; + + /* Don't sync lower file for fear of receiving EROFS error */ + if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) { + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fsync_range(real.file, start, end, datasync); + revert_creds(old_cred); + } + + fdput(real); + + return ret; +} + +static int ovl_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *realfile = file->private_data; + const struct cred *old_cred; + int ret; + + if (!realfile->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma->vm_file = get_file(realfile); + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ret) { + /* Drop reference count from new vm_file value */ + fput(realfile); + } else { + /* Drop reference count from previous vm_file value */ + fput(file); + } + + ovl_file_accessed(file); + + return ret; +} + +static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct fd real; + const struct cred *old_cred; + int ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_fallocate(real.file, mode, offset, len); + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + + fdput(real); + + return ret; +} + +static long ovl_real_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct fd real; + const struct cred *old_cred; + long ret; + + ret = ovl_real_fdget(file, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(file)->i_sb); + ret = vfs_ioctl(real.file, cmd, arg); + revert_creds(old_cred); + + fdput(real); + + return ret; +} + +static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long ret; + struct inode *inode = file_inode(file); + + switch (cmd) { + case FS_IOC_GETFLAGS: + ret = ovl_real_ioctl(file, cmd, arg); + break; + + case FS_IOC_SETFLAGS: + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + ret = ovl_copy_up_with_data(file_dentry(file)); + if (!ret) { + ret = ovl_real_ioctl(file, cmd, arg); + + inode_lock(inode); + ovl_copyflags(ovl_inode_real(inode), inode); + inode_unlock(inode); + } + + mnt_drop_write_file(file); + break; + + default: + ret = -ENOTTY; + } + + return ret; +} + +static long ovl_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + + default: + return -ENOIOCTLCMD; + } + + return ovl_ioctl(file, cmd, arg); +} + +enum ovl_copyop { + OVL_COPY, + OVL_CLONE, + OVL_DEDUPE, +}; + +static ssize_t ovl_copyfile(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int flags, enum ovl_copyop op) +{ + struct inode *inode_out = file_inode(file_out); + struct fd real_in, real_out; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(file_out, &real_out); + if (ret) + return ret; + + ret = ovl_real_fdget(file_in, &real_in); + if (ret) { + fdput(real_out); + return ret; + } + + old_cred = ovl_override_creds(file_inode(file_out)->i_sb); + switch (op) { + case OVL_COPY: + ret = vfs_copy_file_range(real_in.file, pos_in, + real_out.file, pos_out, len, flags); + break; + + case OVL_CLONE: + ret = vfs_clone_file_range(real_in.file, pos_in, + real_out.file, pos_out, len); + break; + + case OVL_DEDUPE: + ret = vfs_dedupe_file_range_one(real_in.file, pos_in, + real_out.file, pos_out, len); + break; + } + revert_creds(old_cred); + + /* Update size */ + ovl_copyattr(ovl_inode_real(inode_out), inode_out); + + fdput(real_in); + fdput(real_out); + + return ret; +} + +static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags, + OVL_COPY); +} + +static int ovl_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_CLONE); +} + +static int ovl_dedupe_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + /* + * Don't copy up because of a dedupe request, this wouldn't make sense + * most of the time (data would be duplicated instead of deduplicated). + */ + if (!ovl_inode_upper(file_inode(file_in)) || + !ovl_inode_upper(file_inode(file_out))) + return -EPERM; + + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, + OVL_DEDUPE); +} + +const struct file_operations ovl_file_operations = { + .open = ovl_open, + .release = ovl_release, + .llseek = ovl_llseek, + .read_iter = ovl_read_iter, + .write_iter = ovl_write_iter, + .fsync = ovl_fsync, + .mmap = ovl_mmap, + .fallocate = ovl_fallocate, + .unlocked_ioctl = ovl_ioctl, + .compat_ioctl = ovl_compat_ioctl, + + .copy_file_range = ovl_copy_file_range, + .clone_file_range = ovl_clone_file_range, + .dedupe_file_range = ovl_dedupe_file_range, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ed16a898caeb..e0bb217c01e2 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,18 +19,10 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; + bool full_copy_up = false; struct dentry *upperdentry; const struct cred *old_cred; - /* - * Check for permissions before trying to copy-up. This is redundant - * since it will be rechecked later by ->setattr() on upper dentry. But - * without this, copy-up can be triggered by just about anybody. - * - * We don't initialize inode->size, which just means that - * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not - * check for a swapfile (which this won't be anyway). - */ err = setattr_prepare(dentry, attr); if (err) return err; @@ -39,10 +31,33 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; - err = ovl_copy_up(dentry); + if (attr->ia_valid & ATTR_SIZE) { + struct inode *realinode = d_inode(ovl_dentry_real(dentry)); + + err = -ETXTBSY; + if (atomic_read(&realinode->i_writecount) < 0) + goto out_drop_write; + + /* Truncate should trigger data copy up as well */ + full_copy_up = true; + } + + if (!full_copy_up) + err = ovl_copy_up(dentry); + else + err = ovl_copy_up_with_data(dentry); if (!err) { + struct inode *winode = NULL; + upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & ATTR_SIZE) { + winode = d_inode(upperdentry); + err = get_write_access(winode); + if (err) + goto out_drop_write; + } + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; @@ -53,7 +68,11 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); + + if (winode) + put_write_access(winode); } +out_drop_write: ovl_drop_write(dentry); out: return err; @@ -133,6 +152,9 @@ int ovl_getattr(const struct path *path, struct kstat *stat, bool samefs = ovl_same_sb(dentry->d_sb); struct ovl_layer *lower_layer = NULL; int err; + bool metacopy_blocks = false; + + metacopy_blocks = ovl_is_metacopy_dentry(dentry); type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); @@ -154,7 +176,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat, lower_layer = ovl_layer_lower(dentry); } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; - u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); + u32 lowermask = STATX_INO | STATX_BLOCKS | + (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); err = vfs_getattr(&realpath, &lowerstat, @@ -183,6 +206,35 @@ int ovl_getattr(const struct path *path, struct kstat *stat, stat->ino = lowerstat.ino; lower_layer = ovl_layer_lower(dentry); } + + /* + * If we are querying a metacopy dentry and lower + * dentry is data dentry, then use the blocks we + * queried just now. We don't have to do additional + * vfs_getattr(). If lower itself is metacopy, then + * additional vfs_getattr() is unavoidable. + */ + if (metacopy_blocks && + realpath.dentry == ovl_dentry_lowerdata(dentry)) { + stat->blocks = lowerstat.blocks; + metacopy_blocks = false; + } + } + + if (metacopy_blocks) { + /* + * If lower is not same as lowerdata or if there was + * no origin on upper, we can end up here. + */ + struct kstat lowerdatastat; + u32 lowermask = STATX_BLOCKS; + + ovl_path_lowerdata(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerdatastat, + lowermask, flags); + if (err) + goto out; + stat->blocks = lowerdatastat.blocks; } } @@ -304,6 +356,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, } revert_creds(old_cred); + /* copy c/mtime */ + ovl_copyattr(d_inode(realdentry), inode); + out_drop_write: ovl_drop_write(dentry); out: @@ -384,38 +439,6 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type) return acl; } -static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) -{ - /* Copy up of disconnected dentry does not set upper alias */ - if (ovl_dentry_upper(dentry) && - (ovl_dentry_has_upper_alias(dentry) || - (dentry->d_flags & DCACHE_DISCONNECTED))) - return false; - - if (special_file(d_inode(dentry)->i_mode)) - return false; - - if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) - return false; - - return true; -} - -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) -{ - int err = 0; - - if (ovl_open_need_copy_up(dentry, file_flags)) { - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up_flags(dentry, file_flags); - ovl_drop_write(dentry); - } - } - - return err; -} - int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) { if (flags & S_ATIME) { @@ -433,6 +456,23 @@ int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) return 0; } +static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + int err; + struct inode *realinode = ovl_inode_real(inode); + const struct cred *old_cred; + + if (!realinode->i_op->fiemap) + return -EOPNOTSUPP; + + old_cred = ovl_override_creds(inode->i_sb); + err = realinode->i_op->fiemap(realinode, fieinfo, start, len); + revert_creds(old_cred); + + return err; +} + static const struct inode_operations ovl_file_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, @@ -440,6 +480,7 @@ static const struct inode_operations ovl_file_inode_operations = { .listxattr = ovl_listxattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, + .fiemap = ovl_fiemap, }; static const struct inode_operations ovl_symlink_inode_operations = { @@ -450,6 +491,15 @@ static const struct inode_operations ovl_symlink_inode_operations = { .update_time = ovl_update_time, }; +static const struct inode_operations ovl_special_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .listxattr = ovl_listxattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, +}; + /* * It is possible to stack overlayfs instance on top of another * overlayfs instance as lower layer. We need to annonate the @@ -520,6 +570,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; + inode->i_fop = &ovl_file_operations; break; case S_IFDIR: @@ -532,7 +583,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, break; default: - inode->i_op = &ovl_file_inode_operations; + inode->i_op = &ovl_special_inode_operations; init_special_inode(inode, mode, rdev); break; } @@ -769,8 +820,9 @@ struct inode *ovl_get_inode(struct super_block *sb, bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? oip->lowerpath->layer->fsid : 0; - bool is_dir; + bool is_dir, metacopy = false; unsigned long ino = 0; + int err = -ENOMEM; if (!realinode) realinode = d_inode(lowerdentry); @@ -787,7 +839,7 @@ struct inode *ovl_get_inode(struct super_block *sb, inode = ovl_iget5(sb, oip->newinode, key); if (!inode) - goto out_nomem; + goto out_err; if (!(inode->i_state & I_NEW)) { /* * Verify that the underlying files stored in the inode @@ -796,11 +848,12 @@ struct inode *ovl_get_inode(struct super_block *sb, if (!ovl_verify_inode(inode, lowerdentry, upperdentry, true)) { iput(inode); - inode = ERR_PTR(-ESTALE); - goto out; + err = -ESTALE; + goto out_err; } dput(upperdentry); + kfree(oip->redirect); goto out; } @@ -812,11 +865,13 @@ struct inode *ovl_get_inode(struct super_block *sb, } else { /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); - if (!inode) - goto out_nomem; + if (!inode) { + err = -ENOMEM; + goto out_err; + } } ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); - ovl_inode_init(inode, upperdentry, lowerdentry); + ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata); if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); @@ -824,6 +879,20 @@ struct inode *ovl_get_inode(struct super_block *sb, if (oip->index) ovl_set_flag(OVL_INDEX, inode); + if (upperdentry) { + err = ovl_check_metacopy_xattr(upperdentry); + if (err < 0) + goto out_err; + metacopy = err; + if (!metacopy) + ovl_set_flag(OVL_UPPERDATA, inode); + } + + OVL_I(inode)->redirect = oip->redirect; + + if (bylower) + ovl_set_flag(OVL_CONST_INO, inode); + /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || oip->numlower > 1) || @@ -837,7 +906,7 @@ struct inode *ovl_get_inode(struct super_block *sb, out: return inode; -out_nomem: - inode = ERR_PTR(-ENOMEM); +out_err: + inode = ERR_PTR(err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c993dd8db739..f28711846dd6 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -24,38 +24,20 @@ struct ovl_lookup_data { bool stop; bool last; char *redirect; + bool metacopy; }; static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; - char *s, *next, *buf = NULL; + char *buf; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); - if (res < 0) { - if (res == -ENODATA || res == -EOPNOTSUPP) - return 0; - goto fail; - } - buf = kzalloc(prelen + res + strlen(post) + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + buf = ovl_get_redirect_xattr(dentry, prelen + strlen(post)); + if (IS_ERR_OR_NULL(buf)) + return PTR_ERR(buf); - if (res == 0) - goto invalid; - - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); - if (res < 0) - goto fail; - if (res == 0) - goto invalid; if (buf[0] == '/') { - for (s = buf; *s++ == '/'; s = next) { - next = strchrnul(s, '/'); - if (s == next) - goto invalid; - } /* * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and @@ -66,9 +48,7 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, */ d->stop = false; } else { - if (strchr(buf, '/') != NULL) - goto invalid; - + res = strlen(buf) + 1; memmove(buf + prelen, buf, res); memcpy(buf, d->name.name, prelen); } @@ -80,16 +60,6 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, d->name.len = strlen(d->redirect); return 0; - -err_free: - kfree(buf); - return 0; -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; -invalid: - pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); - goto err_free; } static int ovl_acceptable(void *ctx, struct dentry *dentry) @@ -252,28 +222,39 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = d->opaque = true; goto put_and_out; } - if (!d_can_lookup(this)) { + /* + * This dentry should be a regular file if previous layer lookup + * found a metacopy dentry. + */ + if (last_element && d->metacopy && !d_is_reg(this)) { d->stop = true; - if (d->is_dir) - goto put_and_out; - - /* - * NB: handle failure to lookup non-last element when non-dir - * redirects become possible - */ - WARN_ON(!last_element); - goto out; + goto put_and_out; } - if (last_element) - d->is_dir = true; - if (d->last) - goto out; + if (!d_can_lookup(this)) { + if (d->is_dir || !last_element) { + d->stop = true; + goto put_and_out; + } + err = ovl_check_metacopy_xattr(this); + if (err < 0) + goto out_err; - if (ovl_is_opaquedir(this)) { - d->stop = true; + d->metacopy = err; + d->stop = !d->metacopy; + if (!d->metacopy || d->last) + goto out; + } else { if (last_element) - d->opaque = true; - goto out; + d->is_dir = true; + if (d->last) + goto out; + + if (ovl_is_opaquedir(this)) { + d->stop = true; + if (last_element) + d->opaque = true; + goto out; + } } err = ovl_check_redirect(this, d, prelen, post); if (err) @@ -823,7 +804,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; - struct ovl_path *stack = NULL; + struct ovl_path *stack = NULL, *origin_path = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *origin = NULL; struct dentry *index = NULL; @@ -834,6 +815,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct dentry *this; unsigned int i; int err; + bool metacopy = false; struct ovl_lookup_data d = { .name = dentry->d_name, .is_dir = false, @@ -841,6 +823,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .stop = false, .last = ofs->config.redirect_follow ? false : !poe->numlower, .redirect = NULL, + .metacopy = false, }; if (dentry->d_name.len > ofs->namelen) @@ -859,7 +842,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; } if (upperdentry && !d.is_dir) { - BUG_ON(!d.stop || d.redirect); + unsigned int origin_ctr = 0; + /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, @@ -870,9 +854,13 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ - err = ovl_check_origin(ofs, upperdentry, &stack, &ctr); + err = ovl_check_origin(ofs, upperdentry, &origin_path, + &origin_ctr); if (err) goto out_put_upper; + + if (d.metacopy) + metacopy = true; } if (d.redirect) { @@ -913,7 +901,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". */ - if (upperdentry && !ctr && !ofs->noxattr) { + if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { err = ovl_fix_origin(dentry, this, upperdentry); if (err) { dput(this); @@ -925,18 +913,35 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * When "verify_lower" feature is enabled, do not merge with a * lower dir that does not match a stored origin xattr. In any * case, only verified origin is used for index lookup. + * + * For non-dir dentry, if index=on, then ensure origin + * matches the dentry found using path based lookup, + * otherwise error out. */ - if (upperdentry && !ctr && ovl_verify_lower(dentry->d_sb)) { + if (upperdentry && !ctr && + ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || + (!d.is_dir && ofs->config.index && origin_path))) { err = ovl_verify_origin(upperdentry, this, false); if (err) { dput(this); - break; + if (d.is_dir) + break; + goto out_put; } - - /* Bless lower dir as verified origin */ origin = this; } + if (d.metacopy) + metacopy = true; + /* + * Do not store intermediate metacopy dentries in chain, + * except top most lower metacopy dentry + */ + if (d.metacopy && ctr) { + dput(this); + continue; + } + stack[ctr].dentry = this; stack[ctr].layer = lower.layer; ctr++; @@ -968,13 +973,48 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } + if (metacopy) { + /* + * Found a metacopy dentry but did not find corresponding + * data dentry + */ + if (d.metacopy) { + err = -EIO; + goto out_put; + } + + err = -EPERM; + if (!ofs->config.metacopy) { + pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n", + dentry); + goto out_put; + } + } else if (!d.is_dir && upperdentry && !ctr && origin_path) { + if (WARN_ON(stack != NULL)) { + err = -EIO; + goto out_put; + } + stack = origin_path; + ctr = 1; + origin_path = NULL; + } + /* * Lookup index by lower inode and verify it matches upper inode. * We only trust dir index if we verified that lower dir matches * origin, otherwise dir index entries may be inconsistent and we - * ignore them. Always lookup index of non-dir and non-upper. + * ignore them. + * + * For non-dir upper metacopy dentry, we already set "origin" if we + * verified that lower matched upper origin. If upper origin was + * not present (because lower layer did not support fh encode/decode), + * or indexing is not enabled, do not set "origin" and skip looking up + * index. This case should be handled in same way as a non-dir upper + * without ORIGIN is handled. + * + * Always lookup index of non-dir non-metacopy and non-upper. */ - if (ctr && (!upperdentry || !d.is_dir)) + if (ctr && (!upperdentry || (!d.is_dir && !metacopy))) origin = stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && @@ -1000,8 +1040,15 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry) ovl_dentry_set_upper_alias(dentry); - else if (index) + else if (index) { upperdentry = dget(index); + upperredirect = ovl_get_redirect_xattr(upperdentry, 0); + if (IS_ERR(upperredirect)) { + err = PTR_ERR(upperredirect); + upperredirect = NULL; + goto out_free_oe; + } + } if (upperdentry || ctr) { struct ovl_inode_params oip = { @@ -1009,22 +1056,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .lowerpath = stack, .index = index, .numlower = ctr, + .redirect = upperredirect, + .lowerdata = (ctr > 1 && !d.is_dir) ? + stack[ctr - 1].dentry : NULL, }; inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; - - /* - * NB: handle redirected hard links when non-dir redirects - * become possible - */ - WARN_ON(OVL_I(inode)->redirect); - OVL_I(inode)->redirect = upperredirect; } revert_creds(old_cred); + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } dput(index); kfree(stack); kfree(d.redirect); @@ -1039,6 +1086,10 @@ out_put: dput(stack[i].dentry); kfree(stack); out_put_upper: + if (origin_path) { + dput(origin_path->dentry); + kfree(origin_path); + } dput(upperdentry); kfree(upperredirect); out: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 7538b9b56237..f61839e1054c 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> +#include <linux/fs.h> #include "ovl_entry.h" enum ovl_path_type { @@ -28,6 +29,7 @@ enum ovl_path_type { #define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure" #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" #define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper" +#define OVL_XATTR_METACOPY OVL_XATTR_PREFIX "metacopy" enum ovl_inode_flag { /* Pure upper dir that may contain non pure upper entries */ @@ -35,6 +37,9 @@ enum ovl_inode_flag { /* Non-merge dir that may contain whiteout entries */ OVL_WHITEOUTS, OVL_INDEX, + OVL_UPPERDATA, + /* Inode number will remain constant over copy up. */ + OVL_CONST_INO, }; enum ovl_entry_flag { @@ -190,6 +195,14 @@ static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) return ret; } +static inline bool ovl_open_flags_need_copy_up(int flags) +{ + if (!flags) + return false; + + return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); +} + /* util.c */ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); @@ -206,15 +219,19 @@ bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); +void ovl_path_lowerdata(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); +struct inode *ovl_inode_lowerdata(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); +struct inode *ovl_inode_realdata(struct inode *inode); struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); @@ -225,18 +242,23 @@ bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); +bool ovl_has_upperdata(struct inode *inode); +void ovl_set_upperdata(struct inode *inode); bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry); + struct dentry *lowerdentry, struct dentry *lowerdata); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); -void ovl_dentry_version_inc(struct dentry *dentry, bool impurity); +void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); -int ovl_copy_up_start(struct dentry *dentry); +int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_check_origin_xattr(struct dentry *dentry); bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, @@ -252,6 +274,9 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry, bool *locked); void ovl_nlink_end(struct dentry *dentry, bool locked); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); +int ovl_check_metacopy_xattr(struct dentry *dentry); +bool ovl_is_metacopy_dentry(struct dentry *dentry); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { @@ -324,7 +349,6 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(const char *name); @@ -334,6 +358,8 @@ struct ovl_inode_params { struct ovl_path *lowerpath; struct dentry *index; unsigned int numlower; + char *redirect; + struct dentry *lowerdata; }; struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, @@ -348,6 +374,14 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) to->i_atime = from->i_atime; to->i_mtime = from->i_mtime; to->i_ctime = from->i_ctime; + i_size_write(to, i_size_read(from)); +} + +static inline void ovl_copyflags(struct inode *from, struct inode *to) +{ + unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME; + + inode_set_flags(to, from->i_flags & mask, mask); } /* dir.c */ @@ -368,9 +402,14 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, int ovl_cleanup(struct inode *dir, struct dentry *dentry); struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); +/* file.c */ +extern const struct file_operations ovl_file_operations; + /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_with_data(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 41655a7d6894..ec237035333a 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -19,6 +19,7 @@ struct ovl_config { bool index; bool nfs_export; int xino; + bool metacopy; }; struct ovl_sb { @@ -88,7 +89,10 @@ static inline struct ovl_entry *OVL_E(struct dentry *dentry) } struct ovl_inode { - struct ovl_dir_cache *cache; + union { + struct ovl_dir_cache *cache; /* directory */ + struct inode *lowerdata; /* regular file */ + }; const char *redirect; u64 version; unsigned long flags; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index ef1fe42ff7bb..cc8303a806b4 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -668,6 +668,21 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); } +static bool ovl_is_impure_dir(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct inode *dir = d_inode(file->f_path.dentry); + + /* + * Only upper dir can be impure, but if we are in the middle of + * iterating a lower real dir, dir could be copied up and marked + * impure. We only want the impure cache if we started iterating + * a real upper dir to begin with. + */ + return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); + +} + static int ovl_iterate_real(struct file *file, struct dir_context *ctx) { int err; @@ -696,7 +711,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) rdt.parent_ino = stat.ino; } - if (ovl_test_flag(OVL_IMPURE, d_inode(dir))) { + if (ovl_is_impure_dir(file)) { rdt.cache = ovl_cache_get_impure(&file->f_path); if (IS_ERR(rdt.cache)) return PTR_ERR(rdt.cache); @@ -727,7 +742,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) */ if (ovl_xino_bits(dentry->d_sb) || (ovl_same_sb(dentry->d_sb) && - (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 704b37311467..2e0fc93c2c06 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -64,6 +64,11 @@ static void ovl_entry_stack_free(struct ovl_entry *oe) dput(oe->lowerstack[i].dentry); } +static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); +module_param_named(metacopy, ovl_metacopy_def, bool, 0644); +MODULE_PARM_DESC(ovl_metacopy_def, + "Default to on or off for the metadata only copy up feature"); + static void ovl_dentry_release(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -74,31 +79,14 @@ static void ovl_dentry_release(struct dentry *dentry) } } -static int ovl_check_append_only(struct inode *inode, int flag) -{ - /* - * This test was moot in vfs may_open() because overlay inode does - * not have the S_APPEND flag, so re-check on real upper inode - */ - if (IS_APPEND(inode)) { - if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) - return -EPERM; - if (flag & O_TRUNC) - return -EPERM; - } - - return 0; -} - static struct dentry *ovl_d_real(struct dentry *dentry, - const struct inode *inode, - unsigned int open_flags, unsigned int flags) + const struct inode *inode) { struct dentry *real; - int err; - if (flags & D_REAL_UPPER) - return ovl_dentry_upper(dentry); + /* It's an overlay file */ + if (inode && d_inode(dentry) == inode) + return dentry; if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) @@ -106,28 +94,19 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; } - if (open_flags) { - err = ovl_open_maybe_copy_up(dentry, open_flags); - if (err) - return ERR_PTR(err); - } - real = ovl_dentry_upper(dentry); - if (real && (!inode || inode == d_inode(real))) { - if (!inode) { - err = ovl_check_append_only(d_inode(real), open_flags); - if (err) - return ERR_PTR(err); - } + if (real && (inode == d_inode(real))) + return real; + + if (real && !inode && ovl_has_upperdata(d_inode(dentry))) return real; - } - real = ovl_dentry_lower(dentry); + real = ovl_dentry_lowerdata(dentry); if (!real) goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags, 0); + real = d_real(real, inode); if (!inode || inode == d_inode(real)) return real; @@ -205,6 +184,7 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) oi->flags = 0; oi->__upperdentry = NULL; oi->lower = NULL; + oi->lowerdata = NULL; mutex_init(&oi->lock); return &oi->vfs_inode; @@ -223,8 +203,11 @@ static void ovl_destroy_inode(struct inode *inode) dput(oi->__upperdentry); iput(oi->lower); + if (S_ISDIR(inode->i_mode)) + ovl_dir_cache_free(inode); + else + iput(oi->lowerdata); kfree(oi->redirect); - ovl_dir_cache_free(inode); mutex_destroy(&oi->lock); call_rcu(&inode->i_rcu, ovl_i_callback); @@ -376,6 +359,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) "on" : "off"); if (ofs->config.xino != ovl_xino_def()) seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); + if (ofs->config.metacopy != ovl_metacopy_def) + seq_printf(m, ",metacopy=%s", + ofs->config.metacopy ? "on" : "off"); return 0; } @@ -413,6 +399,8 @@ enum { OPT_XINO_ON, OPT_XINO_OFF, OPT_XINO_AUTO, + OPT_METACOPY_ON, + OPT_METACOPY_OFF, OPT_ERR, }; @@ -429,6 +417,8 @@ static const match_table_t ovl_tokens = { {OPT_XINO_ON, "xino=on"}, {OPT_XINO_OFF, "xino=off"}, {OPT_XINO_AUTO, "xino=auto"}, + {OPT_METACOPY_ON, "metacopy=on"}, + {OPT_METACOPY_OFF, "metacopy=off"}, {OPT_ERR, NULL} }; @@ -481,6 +471,7 @@ static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + int err; config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); if (!config->redirect_mode) @@ -555,6 +546,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->xino = OVL_XINO_AUTO; break; + case OPT_METACOPY_ON: + config->metacopy = true; + break; + + case OPT_METACOPY_OFF: + config->metacopy = false; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -569,7 +568,20 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return ovl_parse_redirect_mode(config, config->redirect_mode); + err = ovl_parse_redirect_mode(config, config->redirect_mode); + if (err) + return err; + + /* metacopy feature with upper requires redirect_dir=on */ + if (config->upperdir && config->metacopy && !config->redirect_dir) { + pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=on\", falling back to metacopy=off.\n"); + config->metacopy = false; + } else if (config->metacopy && !config->redirect_follow) { + pr_warn("overlayfs: metadata only copy up requires \"redirect_dir=follow\" on non-upper mount, falling back to metacopy=off.\n"); + config->metacopy = false; + } + + return 0; } #define OVL_WORKDIR_NAME "work" @@ -1042,7 +1054,8 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) if (err) { ofs->noxattr = true; ofs->config.index = false; - pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off.\n"); + ofs->config.metacopy = false; + pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); err = 0; } else { vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); @@ -1064,7 +1077,6 @@ static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } - out: mnt_drop_write(mnt); return err; @@ -1375,6 +1387,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ofs->config.index = ovl_index_def; ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); + ofs->config.metacopy = ovl_metacopy_def; err = ovl_parse_opt((char *) data, &ofs->config); if (err) goto out_err; @@ -1445,6 +1458,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } } + if (ofs->config.metacopy && ofs->config.nfs_export) { + pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + ofs->config.nfs_export = false; + } + if (ofs->config.nfs_export) sb->s_export_op = &ovl_export_operations; @@ -1455,7 +1473,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ovl_super_operations; sb->s_xattr = ovl_xattr_handlers; sb->s_fs_info = ofs; - sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK; + sb->s_flags |= SB_POSIXACL; err = -ENOMEM; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); @@ -1474,8 +1492,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); + ovl_set_upperdata(d_inode(root_dentry)); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, - ovl_dentry_lower(root_dentry)); + ovl_dentry_lower(root_dentry), NULL); sb->s_root = root_dentry; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6f1078028c66..8cfb62cc8672 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -133,8 +133,10 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) * Non-dir dentry can hold lower dentry of its copy up origin. */ if (oe->numlower) { - type |= __OVL_PATH_ORIGIN; - if (d_is_dir(dentry)) + if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry))) + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry) || + !ovl_has_upperdata(d_inode(dentry))) type |= __OVL_PATH_MERGE; } } else { @@ -164,6 +166,18 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) } } +void ovl_path_lowerdata(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->numlower) { + path->mnt = oe->lowerstack[oe->numlower - 1].layer->mnt; + path->dentry = oe->lowerstack[oe->numlower - 1].dentry; + } else { + *path = (struct path) { }; + } +} + enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) { enum ovl_path_type type = ovl_path_type(dentry); @@ -195,6 +209,19 @@ struct ovl_layer *ovl_layer_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].layer : NULL; } +/* + * ovl_dentry_lower() could return either a data dentry or metacopy dentry + * dependig on what is stored in lowerstack[0]. At times we need to find + * lower dentry which has data (and not metacopy dentry). This helper + * returns the lower data dentry. + */ +struct dentry *ovl_dentry_lowerdata(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->numlower ? oe->lowerstack[oe->numlower - 1].dentry : NULL; +} + struct dentry *ovl_dentry_real(struct dentry *dentry) { return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); @@ -222,6 +249,26 @@ struct inode *ovl_inode_real(struct inode *inode) return ovl_inode_upper(inode) ?: ovl_inode_lower(inode); } +/* Return inode which contains lower data. Do not return metacopy */ +struct inode *ovl_inode_lowerdata(struct inode *inode) +{ + if (WARN_ON(!S_ISREG(inode->i_mode))) + return NULL; + + return OVL_I(inode)->lowerdata ?: ovl_inode_lower(inode); +} + +/* Return real inode which contains data. Does not return metacopy inode */ +struct inode *ovl_inode_realdata(struct inode *inode) +{ + struct inode *upperinode; + + upperinode = ovl_inode_upper(inode); + if (upperinode && ovl_has_upperdata(inode)) + return upperinode; + + return ovl_inode_lowerdata(inode); +} struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { @@ -279,6 +326,62 @@ void ovl_dentry_set_upper_alias(struct dentry *dentry) ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry); } +static bool ovl_should_check_upperdata(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return false; + + if (!ovl_inode_lower(inode)) + return false; + + return true; +} + +bool ovl_has_upperdata(struct inode *inode) +{ + if (!ovl_should_check_upperdata(inode)) + return true; + + if (!ovl_test_flag(OVL_UPPERDATA, inode)) + return false; + /* + * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of + * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure + * if setting of OVL_UPPERDATA is visible, then effects of writes + * before that are visible too. + */ + smp_rmb(); + return true; +} + +void ovl_set_upperdata(struct inode *inode) +{ + /* + * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure + * if OVL_UPPERDATA flag is visible, then effects of write operations + * before it are visible as well. + */ + smp_wmb(); + ovl_set_flag(OVL_UPPERDATA, inode); +} + +/* Caller should hold ovl_inode->lock */ +bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry)); +} + +bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags) +{ + if (!ovl_open_flags_need_copy_up(flags)) + return false; + + return !ovl_has_upperdata(d_inode(dentry)); +} + bool ovl_redirect_dir(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; @@ -300,7 +403,7 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) } void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry) + struct dentry *lowerdentry, struct dentry *lowerdata) { struct inode *realinode = d_inode(upperdentry ?: lowerdentry); @@ -308,8 +411,11 @@ void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, OVL_I(inode)->__upperdentry = upperdentry; if (lowerdentry) OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); + if (lowerdata) + OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata)); ovl_copyattr(realinode, inode); + ovl_copyflags(realinode, inode); if (!inode->i_ino) inode->i_ino = realinode->i_ino; } @@ -333,7 +439,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) } } -void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) +static void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); @@ -348,6 +454,14 @@ void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) OVL_I(inode)->version++; } +void ovl_dir_modified(struct dentry *dentry, bool impurity) +{ + /* Copy mtime/ctime */ + ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); + + ovl_dentry_version_inc(dentry, impurity); +} + u64 ovl_dentry_version_get(struct dentry *dentry) { struct inode *inode = d_inode(dentry); @@ -368,13 +482,51 @@ struct file *ovl_path_open(struct path *path, int flags) return dentry_open(path, flags | O_NOATIME, current_cred()); } -int ovl_copy_up_start(struct dentry *dentry) +/* Caller should hold ovl_inode->lock */ +static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up_locked(dentry, flags)) + return true; + + return false; +} + +bool ovl_already_copied_up(struct dentry *dentry, int flags) +{ + bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED; + + /* + * Check if copy-up has happened as well as for upper alias (in + * case of hard links) is there. + * + * Both checks are lockless: + * - false negatives: will recheck under oi->lock + * - false positives: + * + ovl_dentry_upper() uses memory barriers to ensure the + * upper dentry is up-to-date + * + ovl_dentry_has_upper_alias() relies on locking of + * upper parent i_rwsem to prevent reordering copy-up + * with rename. + */ + if (ovl_dentry_upper(dentry) && + (ovl_dentry_has_upper_alias(dentry) || disconnected) && + !ovl_dentry_needs_data_copy_up(dentry, flags)) + return true; + + return false; +} + +int ovl_copy_up_start(struct dentry *dentry, int flags) { struct ovl_inode *oi = OVL_I(d_inode(dentry)); int err; err = mutex_lock_interruptible(&oi->lock); - if (!err && ovl_dentry_has_upper_alias(dentry)) { + if (!err && ovl_already_copied_up_locked(dentry, flags)) { err = 1; /* Already copied up */ mutex_unlock(&oi->lock); } @@ -675,3 +827,91 @@ err: pr_err("overlayfs: failed to lock workdir+upperdir\n"); return -EIO; } + +/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ +int ovl_check_metacopy_xattr(struct dentry *dentry) +{ + int res; + + /* Only regular files can have metacopy xattr */ + if (!S_ISREG(d_inode(dentry)->i_mode)) + return 0; + + res = vfs_getxattr(dentry, OVL_XATTR_METACOPY, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return 0; + goto out; + } + + return 1; +out: + pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res); + return res; +} + +bool ovl_is_metacopy_dentry(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (!d_is_reg(dentry)) + return false; + + if (ovl_dentry_upper(dentry)) { + if (!ovl_has_upperdata(d_inode(dentry))) + return true; + return false; + } + + return (oe->numlower > 1); +} + +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + + buf = kzalloc(res + padding + 1, GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + if (res == 0) + goto invalid; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); + if (res < 0) + goto fail; + if (res == 0) + goto invalid; + + if (buf[0] == '/') { + for (s = buf; *s++ == '/'; s = next) { + next = strchrnul(s, '/'); + if (s == next) + goto invalid; + } + } else { + if (strchr(buf, '/') != NULL) + goto invalid; + } + + return buf; + +err_free: + kfree(buf); + return ERR_PTR(res); +fail: + pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); + goto err_free; +invalid: + pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + res = -EINVAL; + goto err_free; +} diff --git a/fs/read_write.c b/fs/read_write.c index 153f8f690490..39b4a21dd933 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1964,6 +1964,44 @@ out_error: } EXPORT_SYMBOL(vfs_dedupe_file_range_compare); +int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, u64 len) +{ + s64 ret; + + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; + + ret = clone_verify_area(dst_file, dst_pos, len, true); + if (ret < 0) + goto out_drop_write; + + ret = -EINVAL; + if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE))) + goto out_drop_write; + + ret = -EXDEV; + if (src_file->f_path.mnt != dst_file->f_path.mnt) + goto out_drop_write; + + ret = -EISDIR; + if (S_ISDIR(file_inode(dst_file)->i_mode)) + goto out_drop_write; + + ret = -EINVAL; + if (!dst_file->f_op->dedupe_file_range) + goto out_drop_write; + + ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, + dst_file, dst_pos, len); +out_drop_write: + mnt_drop_write_file(dst_file); + + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_one); + int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; @@ -1972,11 +2010,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) u64 len; int i; int ret; - bool is_admin = capable(CAP_SYS_ADMIN); u16 count = same->dest_count; - struct file *dst_file; - loff_t dst_off; - ssize_t deduped; + int deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2003,6 +2038,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (off + len > i_size_read(src)) return -EINVAL; + /* Arbitrary 1G limit on a single dedupe request, can be raised. */ + len = min_t(u64, len, 1 << 30); + /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; @@ -2010,54 +2048,28 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } for (i = 0, info = same->info; i < count; i++, info++) { - struct inode *dst; struct fd dst_fd = fdget(info->dest_fd); + struct file *dst_file = dst_fd.file; - dst_file = dst_fd.file; if (!dst_file) { info->status = -EBADF; goto next_loop; } - dst = file_inode(dst_file); - - ret = mnt_want_write_file(dst_file); - if (ret) { - info->status = ret; - goto next_fdput; - } - - dst_off = info->dest_offset; - ret = clone_verify_area(dst_file, dst_off, len, true); - if (ret < 0) { - info->status = ret; - goto next_file; - } - ret = 0; if (info->reserved) { info->status = -EINVAL; - } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { - info->status = -EINVAL; - } else if (file->f_path.mnt != dst_file->f_path.mnt) { - info->status = -EXDEV; - } else if (S_ISDIR(dst->i_mode)) { - info->status = -EISDIR; - } else if (dst_file->f_op->dedupe_file_range == NULL) { - info->status = -EINVAL; - } else { - deduped = dst_file->f_op->dedupe_file_range(file, off, - len, dst_file, - info->dest_offset); - if (deduped == -EBADE) - info->status = FILE_DEDUPE_RANGE_DIFFERS; - else if (deduped < 0) - info->status = deduped; - else - info->bytes_deduped += deduped; + goto next_fdput; } -next_file: - mnt_drop_write_file(dst_file); + deduped = vfs_dedupe_file_range_one(file, off, dst_file, + info->dest_offset, len); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped = len; + next_fdput: fdput(dst_fd); next_loop: diff --git a/fs/xattr.c b/fs/xattr.c index f9cb1db187b7..3a24027c062d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -23,7 +23,6 @@ #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> -#include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) @@ -501,10 +500,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); } fdput(f); return error; @@ -733,10 +732,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); } fdput(f); return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5eaef2c17293..61a5ad2600e8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -931,31 +931,16 @@ xfs_file_clone_range( len, false); } -STATIC ssize_t +STATIC int xfs_file_dedupe_range( - struct file *src_file, - u64 loff, - u64 len, - struct file *dst_file, - u64 dst_loff) + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) { - struct inode *srci = file_inode(src_file); - u64 max_dedupe; - int error; - - /* - * Since we have to read all these pages in to compare them, cut - * it off at MAX_RW_COUNT/2 rounded down to the nearest block. - * That means we won't do more than MAX_RW_COUNT IO per request. - */ - max_dedupe = (MAX_RW_COUNT >> 1) & ~(i_blocksize(srci) - 1); - if (len > max_dedupe) - len = max_dedupe; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, true); - if (error) - return error; - return len; } STATIC int |