Diffstat (limited to 'fs')
250 files changed, 10148 insertions, 9155 deletions
@@ -43,7 +43,6 @@
 #include <linux/mount.h>
 #include <linux/pseudo_fs.h>
-#include <asm/kmap_types.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
@@ -324,13 +323,16 @@ static void aio_free_ring(struct kioctx *ctx)
 	}
 }
 
-static int aio_ring_mremap(struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags)
 {
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct kioctx_table *table;
 	int i, res = -EINVAL;
 
+	if (flags & MREMAP_DONTUNMAP)
+		return -EINVAL;
+
 	spin_lock(&mm->ioctx_lock);
 	rcu_read_lock();
 	table = rcu_dereference(mm->ioctx_table);
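The aio hunk above threads the new mremap flags argument through to the ->mremap() hook so the completion ring refuses MREMAP_DONTUNMAP (duplicating the ring mapping would leave two userspace views of one kioctx). A userspace probe of the behavior might look like the following; this is a hypothetical test program, not part of the series, and it assumes a small nr_events so the ring occupies a single page and that the aio_context_t value is the ring's mapped address, as it is on Linux:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MREMAP_DONTUNMAP
	#define MREMAP_DONTUNMAP 4	/* assumed uapi value if libc lacks it */
	#endif

	int main(void)
	{
		unsigned long ctx = 0;	/* aio_context_t: ring's mapped address */
		long page = sysconf(_SC_PAGESIZE);

		if (syscall(SYS_io_setup, 32, &ctx)) {
			perror("io_setup");
			return 1;
		}
		/* Moving the ring is still allowed; duplicating it is not. */
		void *p = mremap((void *)ctx, page, page,
				 MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
		if (p == MAP_FAILED && errno == EINVAL)
			puts("aio ring rejects MREMAP_DONTUNMAP (expected)");
		else if (p != MAP_FAILED)
			puts("kernel allowed the duplicate mapping");
		syscall(SYS_io_destroy, ctx);
		return 0;
	}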
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 322b7dfb4ea0..5bf781ea6d67 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -4,9 +4,10 @@
  * Copyright 2008 Ian Kent <raven@themaw.net>
  */
 
+#include <linux/module.h>
 #include <linux/miscdevice.h>
 #include <linux/compat.h>
-#include <linux/syscalls.h>
+#include <linux/fdtable.h>
 #include <linux/magic.h>
 #include <linux/nospec.h>
@@ -289,7 +290,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp,
 				       struct autofs_sb_info *sbi,
 				       struct autofs_dev_ioctl *param)
 {
-	return ksys_close(param->ioctlfd);
+	return close_fd(param->ioctlfd);
 }
 
 /*
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 3ac7611ef7ce..fd691e4815c5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -350,7 +350,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1;
 	if (info->si_lasti == BFS_MAX_LASTI)
-		printf("WARNING: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
+		printf("NOTE: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id);
 	else if (info->si_lasti > BFS_MAX_LASTI) {
 		printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id);
 		goto out1;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fa50e8936f5f..950bc177238a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1246,7 +1246,7 @@ out_free_interp:
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
-	retval = arch_setup_additional_pages(bprm, !!interpreter);
+	retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
 	if (retval < 0)
 		goto out;
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
@@ -1307,7 +1307,7 @@ out_free_interp:
 #endif
 
 	finalize_exec(bprm);
-	start_thread(regs, elf_entry, bprm->p);
+	START_THREAD(elf_ex, regs, elf_entry, bprm->p);
 	retval = 0;
 out:
 	return retval;
@@ -2198,6 +2198,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	{
 		size_t sz = get_note_info_size(&info);
 
+		/* For cell spufs */
 		sz += elf_coredump_extra_notes_size();
 
 		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
@@ -2261,6 +2262,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	if (!write_note_info(&info, cprm))
 		goto end_coredump;
 
+	/* For cell spufs */
 	if (elf_coredump_extra_notes_write(cprm))
 		goto end_coredump;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9e84b1928b94..9e56ee1f2652 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -32,6 +32,7 @@
 #include <linux/cleancache.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
+#include <linux/part_stat.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
 #include "internal.h"
@@ -110,24 +111,20 @@ EXPORT_SYMBOL(invalidate_bdev);
 int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
 			loff_t lstart, loff_t lend)
 {
-	struct block_device *claimed_bdev = NULL;
-	int err;
-
 	/*
 	 * If we don't hold exclusive handle for the device, upgrade to it
 	 * while we discard the buffer cache to avoid discarding buffers
 	 * under live filesystem.
 	 */
 	if (!(mode & FMODE_EXCL)) {
-		claimed_bdev = bdev->bd_contains;
-		err = bd_prepare_to_claim(bdev, claimed_bdev,
-					  truncate_bdev_range);
+		int err = bd_prepare_to_claim(bdev, truncate_bdev_range);
 		if (err)
 			return err;
 	}
+
 	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
-	if (claimed_bdev)
-		bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range);
+	if (!(mode & FMODE_EXCL))
+		bd_abort_claiming(bdev, truncate_bdev_range);
 	return 0;
 }
 EXPORT_SYMBOL(truncate_bdev_range);
@@ -548,55 +545,47 @@ EXPORT_SYMBOL(fsync_bdev);
  * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
  * actually.
  */
-struct super_block *freeze_bdev(struct block_device *bdev)
+int freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
 	int error = 0;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (++bdev->bd_fsfreeze_count > 1) {
-		/*
-		 * We don't even need to grab a reference - the first call
-		 * to freeze_bdev grab an active reference and only the last
-		 * thaw_bdev drops it.
-		 */
-		sb = get_super(bdev);
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
+	if (++bdev->bd_fsfreeze_count > 1)
+		goto done;
 
 	sb = get_active_super(bdev);
 	if (!sb)
-		goto out;
+		goto sync;
 	if (sb->s_op->freeze_super)
 		error = sb->s_op->freeze_super(sb);
 	else
 		error = freeze_super(sb);
+	deactivate_super(sb);
+
 	if (error) {
-		deactivate_super(sb);
 		bdev->bd_fsfreeze_count--;
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return ERR_PTR(error);
+		goto done;
 	}
-	deactivate_super(sb);
- out:
+	bdev->bd_fsfreeze_sb = sb;
+
+sync:
 	sync_blockdev(bdev);
+done:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return sb;	/* thaw_bdev releases s->s_umount */
+	return error;
 }
 EXPORT_SYMBOL(freeze_bdev);
 
 /**
  * thaw_bdev  -- unlock filesystem
  * @bdev:	blockdevice to unlock
- * @sb:		associated superblock
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev)
 {
+	struct super_block *sb;
 	int error = -EINVAL;
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
@@ -607,6 +596,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	if (--bdev->bd_fsfreeze_count > 0)
 		goto out;
 
+	sb = bdev->bd_fsfreeze_sb;
 	if (!sb)
 		goto out;
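With the hunks above, freeze_bdev() no longer hands the superblock back to the caller; it is stashed in the new bdev->bd_fsfreeze_sb field and thaw_bdev() picks it up from there. A minimal caller-side sketch of the migration (illustrative only, the surrounding error handling is made up):

	/* Before: the caller had to keep the superblock for thaw time. */
	struct super_block *sb = freeze_bdev(bdev);
	if (IS_ERR(sb))
		return PTR_ERR(sb);
	/* ... take a snapshot, resize the device, etc. ... */
	thaw_bdev(bdev, sb);

	/* After: the freeze state lives entirely in the block_device. */
	int error = freeze_bdev(bdev);
	if (error)
		return error;
	/* ... */
	thaw_bdev(bdev);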
@@ -792,23 +782,19 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 
 static void bdev_free_inode(struct inode *inode)
 {
+	struct block_device *bdev = I_BDEV(inode);
+
+	free_percpu(bdev->bd_stats);
+	kfree(bdev->bd_meta_info);
+
 	kmem_cache_free(bdev_cachep, BDEV_I(inode));
 }
 
-static void init_once(void *foo)
+static void init_once(void *data)
 {
-	struct bdev_inode *ei = (struct bdev_inode *) foo;
-	struct block_device *bdev = &ei->bdev;
+	struct bdev_inode *ei = data;
 
-	memset(bdev, 0, sizeof(*bdev));
-	mutex_init(&bdev->bd_mutex);
-#ifdef CONFIG_SYSFS
-	INIT_LIST_HEAD(&bdev->bd_holder_disks);
-#endif
-	bdev->bd_bdi = &noop_backing_dev_info;
 	inode_init_once(&ei->vfs_inode);
-	/* Initialize mutex for freeze. */
-	mutex_init(&bdev->bd_fsfreeze_mutex);
 }
 
 static void bdev_evict_inode(struct inode *inode)
@@ -870,72 +856,72 @@ void __init bdev_cache_init(void)
 	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
 }
 
-/*
- * Most likely _very_ bad one - but then it's hardly critical for small
- * /dev and can be fixed when somebody will need really large one.
- * Keep in mind that it will be fed through icache hash function too.
- */
-static inline unsigned long hash(dev_t dev)
+struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 {
-	return MAJOR(dev)+MINOR(dev);
-}
+	struct block_device *bdev;
+	struct inode *inode;
 
-static int bdev_test(struct inode *inode, void *data)
-{
-	return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data;
+	inode = new_inode(blockdev_superblock);
+	if (!inode)
+		return NULL;
+	inode->i_mode = S_IFBLK;
+	inode->i_rdev = 0;
+	inode->i_data.a_ops = &def_blk_aops;
+	mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+
+	bdev = I_BDEV(inode);
+	memset(bdev, 0, sizeof(*bdev));
+	mutex_init(&bdev->bd_mutex);
+	mutex_init(&bdev->bd_fsfreeze_mutex);
+	spin_lock_init(&bdev->bd_size_lock);
+	bdev->bd_disk = disk;
+	bdev->bd_partno = partno;
+	bdev->bd_inode = inode;
+	bdev->bd_bdi = &noop_backing_dev_info;
+#ifdef CONFIG_SYSFS
+	INIT_LIST_HEAD(&bdev->bd_holder_disks);
+#endif
+	bdev->bd_stats = alloc_percpu(struct disk_stats);
+	if (!bdev->bd_stats) {
+		iput(inode);
+		return NULL;
+	}
+	return bdev;
 }
 
-static int bdev_set(struct inode *inode, void *data)
+void bdev_add(struct block_device *bdev, dev_t dev)
 {
-	BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data;
-	return 0;
+	bdev->bd_dev = dev;
+	bdev->bd_inode->i_rdev = dev;
+	bdev->bd_inode->i_ino = dev;
+	insert_inode_hash(bdev->bd_inode);
 }
 
 static struct block_device *bdget(dev_t dev)
 {
-	struct block_device *bdev;
 	struct inode *inode;
 
-	inode = iget5_locked(blockdev_superblock, hash(dev),
-			bdev_test, bdev_set, &dev);
-
+	inode = ilookup(blockdev_superblock, dev);
 	if (!inode)
 		return NULL;
-
-	bdev = &BDEV_I(inode)->bdev;
-
-	if (inode->i_state & I_NEW) {
-		spin_lock_init(&bdev->bd_size_lock);
-		bdev->bd_contains = NULL;
-		bdev->bd_super = NULL;
-		bdev->bd_inode = inode;
-		bdev->bd_part_count = 0;
-		inode->i_mode = S_IFBLK;
-		inode->i_rdev = dev;
-		inode->i_bdev = bdev;
-		inode->i_data.a_ops = &def_blk_aops;
-		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-		unlock_new_inode(inode);
-	}
-	return bdev;
+	return &BDEV_I(inode)->bdev;
 }
 
 /**
  * bdgrab -- Grab a reference to an already referenced block device
  * @bdev:	Block device to grab a reference to.
+ *
+ * Returns the block_device with an additional reference when successful,
+ * or NULL if the inode is already being freed.
  */
 struct block_device *bdgrab(struct block_device *bdev)
 {
-	ihold(bdev->bd_inode);
+	if (!igrab(bdev->bd_inode))
+		return NULL;
 	return bdev;
 }
 EXPORT_SYMBOL(bdgrab);
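With bdget() now backed by ilookup() and bdgrab() by igrab(), a reference can no longer be taken on a bdev inode that is already on its way out. Callers that previously relied on ihold() always succeeding have to cope with a NULL return, roughly as follows (sketch; the error value is chosen for illustration):

	if (!bdgrab(bdev))
		return -ENXIO;	/* inode already being freed, device went away */
	/* ... use bdev, then drop the extra reference ... */
	bdput(bdev);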
-struct block_device *bdget_part(struct hd_struct *part)
-{
-	return bdget(part_devt(part));
-}
-
 long nr_blockdev_pages(void)
 {
 	struct inode *inode;
@@ -953,67 +939,8 @@ void bdput(struct block_device *bdev)
 {
 	iput(bdev->bd_inode);
 }
-
 EXPORT_SYMBOL(bdput);
 
-static struct block_device *bd_acquire(struct inode *inode)
-{
-	struct block_device *bdev;
-
-	spin_lock(&bdev_lock);
-	bdev = inode->i_bdev;
-	if (bdev && !inode_unhashed(bdev->bd_inode)) {
-		bdgrab(bdev);
-		spin_unlock(&bdev_lock);
-		return bdev;
-	}
-	spin_unlock(&bdev_lock);
-
-	/*
-	 * i_bdev references block device inode that was already shut down
-	 * (corresponding device got removed).  Remove the reference and look
-	 * up block device inode again just in case new device got
-	 * reestablished under the same device number.
-	 */
-	if (bdev)
-		bd_forget(inode);
-
-	bdev = bdget(inode->i_rdev);
-	if (bdev) {
-		spin_lock(&bdev_lock);
-		if (!inode->i_bdev) {
-			/*
-			 * We take an additional reference to bd_inode,
-			 * and it's released in clear_inode() of inode.
-			 * So, we can access it via ->i_mapping always
-			 * without igrab().
-			 */
-			bdgrab(bdev);
-			inode->i_bdev = bdev;
-			inode->i_mapping = bdev->bd_inode->i_mapping;
-		}
-		spin_unlock(&bdev_lock);
-	}
-	return bdev;
-}
-
-/* Call when you free inode */
-
-void bd_forget(struct inode *inode)
-{
-	struct block_device *bdev = NULL;
-
-	spin_lock(&bdev_lock);
-	if (!sb_is_blkdev_sb(inode->i_sb))
-		bdev = inode->i_bdev;
-	inode->i_bdev = NULL;
-	inode->i_mapping = &inode->i_data;
-	spin_unlock(&bdev_lock);
-
-	if (bdev)
-		bdput(bdev);
-}
-
 /**
  * bd_may_claim - test whether a block device can be claimed
  * @bdev: block device of interest
@@ -1049,7 +976,6 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
 /**
  * bd_prepare_to_claim - claim a block device
  * @bdev: block device of interest
- * @whole: the whole device containing @bdev, may equal @bdev
  * @holder: holder trying to claim @bdev
  *
  * Claim @bdev.  This function fails if @bdev is already claimed by another
@@ -1059,9 +985,12 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
  * RETURNS:
  * 0 if @bdev can be claimed, -EBUSY otherwise.
  */
-int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole,
-		void *holder)
+int bd_prepare_to_claim(struct block_device *bdev, void *holder)
 {
+	struct block_device *whole = bdev_whole(bdev);
+
+	if (WARN_ON_ONCE(!holder))
+		return -EINVAL;
 retry:
 	spin_lock(&bdev_lock);
 	/* if someone else claimed, fail */
@@ -1089,27 +1018,6 @@ retry:
 }
 EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
 
-static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno)
-{
-	struct gendisk *disk = get_gendisk(bdev->bd_dev, partno);
-
-	if (!disk)
-		return NULL;
-	/*
-	 * Now that we hold gendisk reference we make sure bdev we looked up is
-	 * not stale. If it is, it means device got removed and created before
-	 * we looked up gendisk and we fail open in such case. Associating
-	 * unhashed bdev with newly created gendisk could lead to two bdevs
-	 * (and thus two independent caches) being associated with one device
-	 * which is bad.
-	 */
-	if (inode_unhashed(bdev->bd_inode)) {
-		put_disk_and_module(disk);
-		return NULL;
-	}
-	return disk;
-}
-
 static void bd_clear_claiming(struct block_device *whole, void *holder)
 {
 	lockdep_assert_held(&bdev_lock);
@@ -1122,15 +1030,15 @@ static void bd_clear_claiming(struct block_device *whole, void *holder)
 /**
  * bd_finish_claiming - finish claiming of a block device
  * @bdev: block device of interest
- * @whole: whole block device
  * @holder: holder that has claimed @bdev
  *
  * Finish exclusive open of a block device. Mark the device as exclusively
  * open by the holder and wake up all waiters for exclusive open to finish.
  */
-static void bd_finish_claiming(struct block_device *bdev,
-		struct block_device *whole, void *holder)
+static void bd_finish_claiming(struct block_device *bdev, void *holder)
 {
+	struct block_device *whole = bdev_whole(bdev);
+
 	spin_lock(&bdev_lock);
 	BUG_ON(!bd_may_claim(bdev, whole, holder));
 	/*
@@ -1155,11 +1063,10 @@ static void bd_finish_claiming(struct block_device *bdev,
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
-void bd_abort_claiming(struct block_device *bdev, struct block_device *whole,
-		void *holder)
+void bd_abort_claiming(struct block_device *bdev, void *holder)
 {
 	spin_lock(&bdev_lock);
-	bd_clear_claiming(whole, holder);
+	bd_clear_claiming(bdev_whole(bdev), holder);
 	spin_unlock(&bdev_lock);
 }
 EXPORT_SYMBOL(bd_abort_claiming);
@@ -1230,7 +1137,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	WARN_ON_ONCE(!bdev->bd_holder);
 
 	/* FIXME: remove the following once add_disk() handles errors */
-	if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
+	if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir))
 		goto out_unlock;
 
 	holder = bd_find_holder_disk(bdev, disk);
@@ -1249,24 +1156,24 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	holder->disk = disk;
 	holder->refcnt = 1;
 
-	ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
 	if (ret)
 		goto out_free;
 
-	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
+	ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
 	if (ret)
 		goto out_del;
 	/*
 	 * bdev could be deleted beneath us which would implicitly destroy
 	 * the holder directory.  Hold on to it.
 	 */
-	kobject_get(bdev->bd_part->holder_dir);
+	kobject_get(bdev->bd_holder_dir);
 	list_add(&holder->list, &bdev->bd_holder_disks);
 	goto out_unlock;
 
 out_del:
-	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
+	del_symlink(disk->slave_dir, bdev_kobj(bdev));
 out_free:
 	kfree(holder);
 out_unlock:
@@ -1294,10 +1201,9 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	holder = bd_find_holder_disk(bdev, disk);
 
 	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
-		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
-		del_symlink(bdev->bd_part->holder_dir,
-			    &disk_to_dev(disk)->kobj);
-		kobject_put(bdev->bd_part->holder_dir);
+		del_symlink(disk->slave_dir, bdev_kobj(bdev));
+		del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+		kobject_put(bdev->bd_holder_dir);
 		list_del_init(&holder->list);
 		kfree(holder);
 	}
@@ -1307,77 +1213,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 #endif
 
-/**
- * check_disk_size_change - checks for disk size change and adjusts bdev size.
- * @disk: struct gendisk to check
- * @bdev: struct bdev to adjust.
- * @verbose: if %true log a message about a size change if there is any
- *
- * This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs. When shrinking the bdev size, its all caches
- * are freed.
- */
-static void check_disk_size_change(struct gendisk *disk,
-		struct block_device *bdev, bool verbose)
-{
-	loff_t disk_size, bdev_size;
-
-	spin_lock(&bdev->bd_size_lock);
-	disk_size = (loff_t)get_capacity(disk) << 9;
-	bdev_size = i_size_read(bdev->bd_inode);
-	if (disk_size != bdev_size) {
-		if (verbose) {
-			printk(KERN_INFO
-			       "%s: detected capacity change from %lld to %lld\n",
-			       disk->disk_name, bdev_size, disk_size);
-		}
-		i_size_write(bdev->bd_inode, disk_size);
-	}
-	spin_unlock(&bdev->bd_size_lock);
-
-	if (bdev_size > disk_size) {
-		if (__invalidate_device(bdev, false))
-			pr_warn("VFS: busy inodes on resized disk %s\n",
-				disk->disk_name);
-	}
-}
-
-/**
- * revalidate_disk_size - checks for disk size change and adjusts bdev size.
- * @disk: struct gendisk to check
- * @verbose: if %true log a message about a size change if there is any
- *
- * This routine checks to see if the bdev size does not match the disk size
- * and adjusts it if it differs. When shrinking the bdev size, its all caches
- * are freed.
- */
-void revalidate_disk_size(struct gendisk *disk, bool verbose)
-{
-	struct block_device *bdev;
-
-	/*
-	 * Hidden disks don't have associated bdev so there's no point in
-	 * revalidating them.
-	 */
-	if (disk->flags & GENHD_FL_HIDDEN)
-		return;
-
-	bdev = bdget_disk(disk, 0);
-	if (bdev) {
-		check_disk_size_change(disk, bdev, verbose);
-		bdput(bdev);
-	}
-}
-EXPORT_SYMBOL(revalidate_disk_size);
-
-void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors)
-{
-	spin_lock(&bdev->bd_size_lock);
-	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
-	spin_unlock(&bdev->bd_size_lock);
-}
-EXPORT_SYMBOL(bd_set_nr_sectors);
-
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 int bdev_disk_changed(struct block_device *bdev, bool invalidate)
@@ -1411,8 +1246,6 @@ rescan:
 			disk->fops->revalidate_disk(disk);
 	}
 
-	check_disk_size_change(disk, bdev, !invalidate);
-
 	if (get_capacity(disk)) {
 		ret = blk_add_partitions(disk, bdev);
 		if (ret == -EAGAIN)
@@ -1439,71 +1272,19 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
 *  mutex_lock(part->bd_mutex)
 *    mutex_lock_nested(whole->bd_mutex, 1)
 */
-
-static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
-		int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode)
 {
-	struct block_device *whole = NULL, *claiming = NULL;
-	struct gendisk *disk;
-	int ret;
-	int partno;
-	bool first_open = false, unblock_events = true, need_restart;
-
- restart:
-	need_restart = false;
-	ret = -ENXIO;
-	disk = bdev_get_gendisk(bdev, &partno);
-	if (!disk)
-		goto out;
-
-	if (partno) {
-		whole = bdget_disk(disk, 0);
-		if (!whole) {
-			ret = -ENOMEM;
-			goto out_put_disk;
-		}
-	}
-
-	if (!for_part && (mode & FMODE_EXCL)) {
-		WARN_ON_ONCE(!holder);
-		if (whole)
-			claiming = whole;
-		else
-			claiming = bdev;
-		ret = bd_prepare_to_claim(bdev, claiming, holder);
-		if (ret)
-			goto out_put_whole;
-	}
+	struct gendisk *disk = bdev->bd_disk;
+	int ret = 0;
 
-	disk_block_events(disk);
-	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
-		first_open = true;
-		bdev->bd_disk = disk;
-		bdev->bd_contains = bdev;
-		bdev->bd_partno = partno;
-
-		if (!partno) {
-			ret = -ENXIO;
-			bdev->bd_part = disk_get_part(disk, partno);
-			if (!bdev->bd_part)
-				goto out_clear;
-
+		if (!bdev_is_partition(bdev)) {
 			ret = 0;
-			if (disk->fops->open) {
+			if (disk->fops->open)
 				ret = disk->fops->open(bdev, mode);
-				/*
-				 * If we lost a race with 'disk' being deleted,
-				 * try again.  See md.c
-				 */
-				if (ret == -ERESTARTSYS)
-					need_restart = true;
-			}
-			if (!ret) {
-				bd_set_nr_sectors(bdev, get_capacity(disk));
+			if (!ret)
 				set_init_blocksize(bdev);
-			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1516,28 +1297,33 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
 				bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 
 			if (ret)
-				goto out_clear;
+				return ret;
 		} else {
-			BUG_ON(for_part);
-			ret = __blkdev_get(whole, mode, NULL, 1);
-			if (ret)
-				goto out_clear;
-			bdev->bd_contains = bdgrab(whole);
-			bdev->bd_part = disk_get_part(disk, partno);
+			struct block_device *whole = bdgrab(disk->part0);
+
+			mutex_lock_nested(&whole->bd_mutex, 1);
+			ret = __blkdev_get(whole, mode);
+			if (ret) {
+				mutex_unlock(&whole->bd_mutex);
+				bdput(whole);
+				return ret;
+			}
+			whole->bd_part_count++;
+			mutex_unlock(&whole->bd_mutex);
+
 			if (!(disk->flags & GENHD_FL_UP) ||
-			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
-				ret = -ENXIO;
-				goto out_clear;
+			    !bdev_nr_sectors(bdev)) {
+				__blkdev_put(whole, mode, 1);
+				bdput(whole);
+				return -ENXIO;
 			}
-			bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects);
 			set_init_blocksize(bdev);
 		}
 
 		if (bdev->bd_bdi == &noop_backing_dev_info)
 			bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 	} else {
-		if (bdev->bd_contains == bdev) {
-			ret = 0;
+		if (!bdev_is_partition(bdev)) {
 			if (bdev->bd_disk->fops->open)
 				ret = bdev->bd_disk->fops->open(bdev, mode);
 			/* the same as first opener case, read comment there */
@@ -1545,101 +1331,145 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder,
 			    (!ret || ret == -ENOMEDIUM))
 				bdev_disk_changed(bdev, ret == -ENOMEDIUM);
 			if (ret)
-				goto out_unlock_bdev;
+				return ret;
 		}
 	}
 	bdev->bd_openers++;
-	if (for_part)
-		bdev->bd_part_count++;
-	if (claiming)
-		bd_finish_claiming(bdev, claiming, holder);
+	return 0;
+}
 
-	/*
-	 * Block event polling for write claims if requested. Any write holder
-	 * makes the write_holder state stick until all are released. This is
-	 * good enough and tracking individual writeable reference is too
-	 * fragile given the way @mode is used in blkdev_get/put().
-	 */
-	if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-	    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
-		bdev->bd_write_holder = true;
-		unblock_events = false;
-	}
-	mutex_unlock(&bdev->bd_mutex);
+struct block_device *blkdev_get_no_open(dev_t dev)
+{
+	struct block_device *bdev;
+	struct gendisk *disk;
 
-	if (unblock_events)
-		disk_unblock_events(disk);
+	down_read(&bdev_lookup_sem);
+	bdev = bdget(dev);
+	if (!bdev) {
+		up_read(&bdev_lookup_sem);
+		blk_request_module(dev);
+		down_read(&bdev_lookup_sem);
+
+		bdev = bdget(dev);
+		if (!bdev)
+			goto unlock;
+	}
 
-	/* only one opener holds refs to the module and disk */
-	if (!first_open)
-		put_disk_and_module(disk);
-	if (whole)
-		bdput(whole);
-	return 0;
+	disk = bdev->bd_disk;
+	if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
+		goto bdput;
+	if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
+		goto put_disk;
+	if (!try_module_get(bdev->bd_disk->fops->owner))
+		goto put_disk;
+	up_read(&bdev_lookup_sem);
+	return bdev;
+put_disk:
+	put_disk(disk);
+bdput:
+	bdput(bdev);
+unlock:
+	up_read(&bdev_lookup_sem);
+	return NULL;
+}
 
- out_clear:
-	disk_put_part(bdev->bd_part);
-	bdev->bd_disk = NULL;
-	bdev->bd_part = NULL;
-	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, mode, 1);
-	bdev->bd_contains = NULL;
- out_unlock_bdev:
-	if (claiming)
-		bd_abort_claiming(bdev, claiming, holder);
-	mutex_unlock(&bdev->bd_mutex);
-	disk_unblock_events(disk);
- out_put_whole:
-	if (whole)
-		bdput(whole);
- out_put_disk:
-	put_disk_and_module(disk);
-	if (need_restart)
-		goto restart;
- out:
-	return ret;
+void blkdev_put_no_open(struct block_device *bdev)
+{
+	module_put(bdev->bd_disk->fops->owner);
+	put_disk(bdev->bd_disk);
+	bdput(bdev);
 }
 
 /**
- * blkdev_get - open a block device
- * @bdev: block_device to open
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
  * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
- * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
- * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
- * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ * Open the block device described by device number @dev. If @mode includes
+ * %FMODE_EXCL, the block device is opened with exclusive access. Specifying
+ * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for
+ * the same @holder.
 *
- * On success, the reference count of @bdev is unchanged.  On failure,
- * @bdev is put.
+ * Use this interface ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a device
+ * number. Everything else should use blkdev_get_by_path().
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
- * 0 on success, -errno on failure.
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
-static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
 {
-	int ret, perm = 0;
+	bool unblock_events = true;
+	struct block_device *bdev;
+	struct gendisk *disk;
+	int ret;
 
-	if (mode & FMODE_READ)
-		perm |= MAY_READ;
-	if (mode & FMODE_WRITE)
-		perm |= MAY_WRITE;
-	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
+			MAJOR(dev), MINOR(dev),
+			((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) |
+			((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0));
 	if (ret)
-		goto bdput;
+		return ERR_PTR(ret);
+
+	/*
+	 * If we lost a race with 'disk' being deleted, try again. See md.c.
+	 */
+retry:
+	bdev = blkdev_get_no_open(dev);
+	if (!bdev)
+		return ERR_PTR(-ENXIO);
+	disk = bdev->bd_disk;
+
+	if (mode & FMODE_EXCL) {
+		ret = bd_prepare_to_claim(bdev, holder);
+		if (ret)
+			goto put_blkdev;
+	}
+
+	disk_block_events(disk);
 
-	ret = __blkdev_get(bdev, mode, holder, 0);
+	mutex_lock(&bdev->bd_mutex);
+	ret = __blkdev_get(bdev, mode);
 	if (ret)
-		goto bdput;
-	return 0;
+		goto abort_claiming;
+	if (mode & FMODE_EXCL) {
+		bd_finish_claiming(bdev, holder);
 
-bdput:
-	bdput(bdev);
-	return ret;
+		/*
+		 * Block event polling for write claims if requested. Any write
+		 * holder makes the write_holder state stick until all are
+		 * released. This is good enough and tracking individual
+		 * writeable reference is too fragile given the way @mode is
+		 * used in blkdev_get/put().
+		 */
+		if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
+		    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
+			bdev->bd_write_holder = true;
+			unblock_events = false;
+		}
+	}
+	mutex_unlock(&bdev->bd_mutex);
+
+	if (unblock_events)
+		disk_unblock_events(disk);
+	return bdev;
+
+abort_claiming:
+	if (mode & FMODE_EXCL)
+		bd_abort_claiming(bdev, holder);
+	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
+put_blkdev:
+	blkdev_put_no_open(bdev);
+	if (ret == -ERESTARTSYS)
+		goto retry;
+	return ERR_PTR(ret);
 }
+EXPORT_SYMBOL(blkdev_get_by_dev);
 
 /**
 * blkdev_get_by_path - open a block device by name
@@ -1647,32 +1477,30 @@ bdput:
 * @mode: FMODE_* mask
 * @holder: exclusive holder identifier
 *
- * Open the blockdevice described by the device file at @path.  @mode
- * and @holder are identical to blkdev_get().
- *
- * On success, the returned block_device has reference count of one.
+ * Open the block device described by the device file at @path. If @mode
+ * includes %FMODE_EXCL, the block device is opened with exclusive access.
+ * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may
+ * nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
- * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ * Reference to the block_device on success, ERR_PTR(-errno) on failure.
 */
 struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 					void *holder)
 {
 	struct block_device *bdev;
-	int err;
-
-	bdev = lookup_bdev(path);
-	if (IS_ERR(bdev))
-		return bdev;
+	dev_t dev;
+	int error;
 
-	err = blkdev_get(bdev, mode, holder);
-	if (err)
-		return ERR_PTR(err);
+	error = lookup_bdev(path, &dev);
+	if (error)
+		return ERR_PTR(error);
 
-	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+	bdev = blkdev_get_by_dev(dev, mode, holder);
+	if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
 		blkdev_put(bdev, mode);
 		return ERR_PTR(-EACCES);
 	}
@@ -1681,45 +1509,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
 
-/**
- * blkdev_get_by_dev - open a block device by device number
- * @dev: device number of block device to open
- * @mode: FMODE_* mask
- * @holder: exclusive holder identifier
- *
- * Open the blockdevice described by device number @dev.  @mode and
- * @holder are identical to blkdev_get().
- *
- * Use it ONLY if you really do not have anything better - i.e. when
- * you are behind a truly sucky interface and all you are given is a
- * device number.  _Never_ to be used for internal purposes.  If you
- * ever need it - reconsider your API.
- *
- * On success, the returned block_device has reference count of one.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * Pointer to block_device on success, ERR_PTR(-errno) on failure.
- */
-struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
-{
-	struct block_device *bdev;
-	int err;
-
-	bdev = bdget(dev);
-	if (!bdev)
-		return ERR_PTR(-ENOMEM);
-
-	err = blkdev_get(bdev, mode, holder);
-	if (err)
-		return ERR_PTR(err);
-
-	return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev;
@@ -1741,14 +1530,12 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if ((filp->f_flags & O_ACCMODE) == 3)
 		filp->f_mode |= FMODE_WRITE_IOCTL;
 
-	bdev = bd_acquire(inode);
-	if (bdev == NULL)
-		return -ENOMEM;
-
+	bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
-
-	return blkdev_get(bdev, filp->f_mode, filp);
+	return 0;
 }
 
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1774,34 +1561,28 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		WARN_ON_ONCE(bdev->bd_holders);
 		sync_blockdev(bdev);
 		kill_bdev(bdev);
 		bdev_write_inode(bdev);
+		if (bdev_is_partition(bdev))
+			victim = bdev_whole(bdev);
 	}
-	if (bdev->bd_contains == bdev) {
-		if (disk->fops->release)
-			disk->fops->release(disk, mode);
-	}
-	if (!bdev->bd_openers) {
-		disk_put_part(bdev->bd_part);
-		bdev->bd_part = NULL;
-		bdev->bd_disk = NULL;
-		if (bdev != bdev->bd_contains)
-			victim = bdev->bd_contains;
-		bdev->bd_contains = NULL;
-
-		put_disk_and_module(disk);
-	}
+
+	if (!bdev_is_partition(bdev) && disk->fops->release)
+		disk->fops->release(disk, mode);
 	mutex_unlock(&bdev->bd_mutex);
-	bdput(bdev);
-	if (victim)
+	if (victim) {
 		__blkdev_put(victim, mode, 1);
+		bdput(victim);
+	}
 }
 
 void blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+	struct gendisk *disk = bdev->bd_disk;
+
 	mutex_lock(&bdev->bd_mutex);
 
 	if (mode & FMODE_EXCL) {
+		struct block_device *whole = bdev_whole(bdev);
 		bool bdev_free;
 
 		/*
@@ -1812,13 +1593,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 		spin_lock(&bdev_lock);
 
 		WARN_ON_ONCE(--bdev->bd_holders < 0);
-		WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
+		WARN_ON_ONCE(--whole->bd_holders < 0);
 
-		/* bd_contains might point to self, check in a separate step */
 		if ((bdev_free = !bdev->bd_holders))
 			bdev->bd_holder = NULL;
-		if (!bdev->bd_contains->bd_holders)
-			bdev->bd_contains->bd_holder = NULL;
+		if (!whole->bd_holders)
+			whole->bd_holder = NULL;
 
 		spin_unlock(&bdev_lock);
 
		/*
@@ -1827,7 +1607,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
 		 * unblock evpoll if it was a write holder.
 		 */
 		if (bdev_free && bdev->bd_write_holder) {
-			disk_unblock_events(bdev->bd_disk);
+			disk_unblock_events(disk);
 			bdev->bd_write_holder = false;
 		}
 	}
 
 	/*
 	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
 	 * event. This is to ensure detection of media removal commanded
 	 * from userland - e.g. eject(1).
 	 */
-	disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE);
-
+	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
 	mutex_unlock(&bdev->bd_mutex);
 
 	__blkdev_put(bdev, mode, 0);
+	blkdev_put_no_open(bdev);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -2054,37 +1834,32 @@ const struct file_operations def_blk_fops = {
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
-struct block_device *lookup_bdev(const char *pathname)
+int lookup_bdev(const char *pathname, dev_t *dev)
 {
-	struct block_device *bdev;
 	struct inode *inode;
 	struct path path;
 	int error;
 
 	if (!pathname || !*pathname)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
 	if (error)
-		return ERR_PTR(error);
+		return error;
 
 	inode = d_backing_inode(path.dentry);
 	error = -ENOTBLK;
 	if (!S_ISBLK(inode->i_mode))
-		goto fail;
+		goto out_path_put;
 	error = -EACCES;
 	if (!may_open_dev(&path))
-		goto fail;
-	error = -ENOMEM;
-	bdev = bd_acquire(inode);
-	if (!bdev)
-		goto fail;
-out:
+		goto out_path_put;
+
+	*dev = inode->i_rdev;
+	error = 0;
+out_path_put:
 	path_put(&path);
-	return bdev;
-fail:
-	bdev = ERR_PTR(error);
-	goto out;
+	return error;
 }
 EXPORT_SYMBOL(lookup_bdev);
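Taken together, the block_dev.c hunks above split the old lookup_bdev()-then-blkdev_get() dance into two steps: path resolution to a dev_t, and a device-number open that also owns the gendisk/module references. A caller that used to open by path would migrate roughly like this (sketch; the path, mode, and holder cookie are placeholders):

	dev_t dev;
	int error;
	struct block_device *bdev;

	/* lookup_bdev() now only resolves the path to a device number. */
	error = lookup_bdev("/dev/sda1", &dev);
	if (error)
		return error;

	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				 my_holder);	/* hypothetical holder cookie */
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);
	/* ... do I/O ... */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);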
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index e738f6206ea5..9f1b1a88e317 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o
 
 btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
-	   file-item.o inode-item.o inode-map.o disk-io.o \
+	   file-item.o inode-item.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
@@ -16,6 +16,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
+btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 771a036867dc..02d7d7b2563b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -783,8 +783,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 		BUG_ON(ref->key_for_search.type);
 		BUG_ON(!ref->wanted_disk_byte);
 
-		eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
-				     ref->level - 1, NULL);
+		eb = read_tree_block(fs_info, ref->wanted_disk_byte,
+				     ref->root_id, 0, ref->level - 1, NULL);
 		if (IS_ERR(eb)) {
 			free_pref(ref);
 			return PTR_ERR(eb);
@@ -1331,7 +1331,7 @@ again:
 			struct extent_buffer *eb;
 
 			eb = read_tree_block(fs_info, ref->parent, 0,
-					     ref->level, NULL);
+					     0, ref->level, NULL);
 			if (IS_ERR(eb)) {
 				ret = PTR_ERR(eb);
 				goto out;
@@ -1341,14 +1341,12 @@ again:
 				goto out;
 			}
 
-			if (!path->skip_locking) {
+			if (!path->skip_locking)
 				btrfs_tree_read_lock(eb);
-				btrfs_set_lock_blocking_read(eb);
-			}
 			ret = find_extent_in_eb(eb, bytenr,
 						*extent_item_pos, &eie, ignore_offset);
 			if (!path->skip_locking)
-				btrfs_tree_read_unlock_blocking(eb);
+				btrfs_tree_read_unlock(eb);
 			free_extent_buffer(eb);
 			if (ret < 0)
 				goto out;
@@ -1671,13 +1669,11 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	s64 bytes_left = ((s64)size) - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
-	int leave_spinning = path->leave_spinning;
 	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
-	path->leave_spinning = 1;
 	while (1) {
 		bytes_left -= name_len;
 		if (bytes_left >= 0)
@@ -1685,7 +1681,7 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 				   name_off, name_len);
 		if (eb != eb_in) {
 			if (!path->skip_locking)
-				btrfs_tree_read_unlock_blocking(eb);
+				btrfs_tree_read_unlock(eb);
 			free_extent_buffer(eb);
 		}
 		ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1705,8 +1701,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		if (eb != eb_in) {
-			if (!path->skip_locking)
-				btrfs_set_lock_blocking_read(eb);
 			path->nodes[0] = NULL;
 			path->locks[0] = 0;
 		}
@@ -1723,7 +1717,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	}
 
 	btrfs_release_path(path);
-	path->leave_spinning = leave_spinning;
 
 	if (ret)
 		return ERR_PTR(ret);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3ba6f3839d39..52f2198d44c9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -424,6 +424,23 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 	return ret;
 }
 
+static bool space_cache_v1_done(struct btrfs_block_group *cache)
+{
+	bool ret;
+
+	spin_lock(&cache->lock);
+	ret = cache->cached != BTRFS_CACHE_FAST;
+	spin_unlock(&cache->lock);
+
+	return ret;
+}
+
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+				struct btrfs_caching_control *caching_ctl)
+{
+	wait_event(caching_ctl->wait, space_cache_v1_done(cache));
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static void fragment_free_space(struct btrfs_block_group *block_group)
 {
@@ -639,11 +656,28 @@ static noinline void caching_thread(struct btrfs_work *work)
 	mutex_lock(&caching_ctl->mutex);
 	down_read(&fs_info->commit_root_sem);
 
+	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
+		ret = load_free_space_cache(block_group);
+		if (ret == 1) {
+			ret = 0;
+			goto done;
+		}
+
+		/*
+		 * We failed to load the space cache, set ourselves to
+		 * CACHE_STARTED and carry on.
+		 */
+		spin_lock(&block_group->lock);
+		block_group->cached = BTRFS_CACHE_STARTED;
+		spin_unlock(&block_group->lock);
+		wake_up(&caching_ctl->wait);
+	}
+
 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 		ret = load_free_space_tree(caching_ctl);
 	else
 		ret = load_extent_tree_free(caching_ctl);
-
+done:
 	spin_lock(&block_group->lock);
 	block_group->caching_ctl = NULL;
 	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
@@ -679,7 +713,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 {
 	DEFINE_WAIT(wait);
 	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_caching_control *caching_ctl = NULL;
 	int ret = 0;
 
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
@@ -691,119 +725,41 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
 	init_waitqueue_head(&caching_ctl->wait);
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->start;
-	refcount_set(&caching_ctl->count, 1);
+	refcount_set(&caching_ctl->count, 2);
 	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
-	/*
-	 * This should be a rare occasion, but this could happen I think in the
-	 * case where one thread starts to load the space cache info, and then
-	 * some other thread starts a transaction commit which tries to do an
-	 * allocation while the other thread is still loading the space cache
-	 * info.  The previous loop should have kept us from choosing this block
-	 * group, but if we've moved to the state where we will wait on caching
-	 * block groups we need to first check if we're doing a fast load here,
-	 * so we can wait for it to finish, otherwise we could end up allocating
-	 * from a block group who's cache gets evicted for one reason or
-	 * another.
-	 */
-	while (cache->cached == BTRFS_CACHE_FAST) {
-		struct btrfs_caching_control *ctl;
-
-		ctl = cache->caching_ctl;
-		refcount_inc(&ctl->count);
-		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&cache->lock);
-
-		schedule();
-
-		finish_wait(&ctl->wait, &wait);
-		btrfs_put_caching_control(ctl);
-		spin_lock(&cache->lock);
-	}
-
 	if (cache->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&cache->lock);
 		kfree(caching_ctl);
-		return 0;
+
+		caching_ctl = cache->caching_ctl;
+		if (caching_ctl)
+			refcount_inc(&caching_ctl->count);
+		spin_unlock(&cache->lock);
+		goto out;
 	}
 	WARN_ON(cache->caching_ctl);
 	cache->caching_ctl = caching_ctl;
-	cache->cached = BTRFS_CACHE_FAST;
+	if (btrfs_test_opt(fs_info, SPACE_CACHE))
+		cache->cached = BTRFS_CACHE_FAST;
+	else
+		cache->cached = BTRFS_CACHE_STARTED;
+	cache->has_caching_ctl = 1;
 	spin_unlock(&cache->lock);
 
-	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
-		mutex_lock(&caching_ctl->mutex);
-		ret = load_free_space_cache(cache);
-
-		spin_lock(&cache->lock);
-		if (ret == 1) {
-			cache->caching_ctl = NULL;
-			cache->cached = BTRFS_CACHE_FINISHED;
-			cache->last_byte_to_unpin = (u64)-1;
-			caching_ctl->progress = (u64)-1;
-		} else {
-			if (load_cache_only) {
-				cache->caching_ctl = NULL;
-				cache->cached = BTRFS_CACHE_NO;
-			} else {
-				cache->cached = BTRFS_CACHE_STARTED;
-				cache->has_caching_ctl = 1;
-			}
-		}
-		spin_unlock(&cache->lock);
-#ifdef CONFIG_BTRFS_DEBUG
-		if (ret == 1 &&
-		    btrfs_should_fragment_free_space(cache)) {
-			u64 bytes_used;
-
-			spin_lock(&cache->space_info->lock);
-			spin_lock(&cache->lock);
-			bytes_used = cache->length - cache->used;
-			cache->space_info->bytes_used += bytes_used >> 1;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-			fragment_free_space(cache);
-		}
-#endif
-		mutex_unlock(&caching_ctl->mutex);
-
-		wake_up(&caching_ctl->wait);
-		if (ret == 1) {
-			btrfs_put_caching_control(caching_ctl);
-			btrfs_free_excluded_extents(cache);
-			return 0;
-		}
-	} else {
-		/*
-		 * We're either using the free space tree or no caching at all.
-		 * Set cached to the appropriate value and wakeup any waiters.
-		 */
-		spin_lock(&cache->lock);
-		if (load_cache_only) {
-			cache->caching_ctl = NULL;
-			cache->cached = BTRFS_CACHE_NO;
-		} else {
-			cache->cached = BTRFS_CACHE_STARTED;
-			cache->has_caching_ctl = 1;
-		}
-		spin_unlock(&cache->lock);
-		wake_up(&caching_ctl->wait);
-	}
-
-	if (load_cache_only) {
-		btrfs_put_caching_control(caching_ctl);
-		return 0;
-	}
-
-	down_write(&fs_info->commit_root_sem);
+	spin_lock(&fs_info->block_group_cache_lock);
 	refcount_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
-	up_write(&fs_info->commit_root_sem);
+	spin_unlock(&fs_info->block_group_cache_lock);
 
 	btrfs_get_block_group(cache);
 
 	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 
+out:
+	if (load_cache_only && caching_ctl)
+		btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
+	if (caching_ctl)
+		btrfs_put_caching_control(caching_ctl);
+
 	return ret;
 }
@@ -892,8 +848,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_block_group *block_group;
 	struct btrfs_free_cluster *cluster;
-	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_key key;
 	struct inode *inode;
 	struct kobject *kobj = NULL;
 	int ret;
@@ -971,42 +925,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&trans->transaction->dirty_bgs_lock);
 	mutex_unlock(&trans->transaction->cache_write_mutex);
 
-	if (!IS_ERR(inode)) {
-		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-		if (ret) {
-			btrfs_add_delayed_iput(inode);
-			goto out;
-		}
-		clear_nlink(inode);
-		/* One for the block groups ref */
-		spin_lock(&block_group->lock);
-		if (block_group->iref) {
-			block_group->iref = 0;
-			block_group->inode = NULL;
-			spin_unlock(&block_group->lock);
-			iput(inode);
-		} else {
-			spin_unlock(&block_group->lock);
-		}
-		/* One for our lookup ref */
-		btrfs_add_delayed_iput(inode);
-	}
-
-	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-	key.type = 0;
-	key.offset = block_group->start;
-
-	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
-	if (ret < 0)
+	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
+	if (ret)
 		goto out;
-	if (ret > 0)
-		btrfs_release_path(path);
-	if (ret == 0) {
-		ret = btrfs_del_item(trans, tree_root, path);
-		if (ret)
-			goto out;
-		btrfs_release_path(path);
-	}
 
 	spin_lock(&fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
@@ -1043,7 +964,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	if (block_group->cached == BTRFS_CACHE_STARTED)
 		btrfs_wait_block_group_cache_done(block_group);
 	if (block_group->has_caching_ctl) {
-		down_write(&fs_info->commit_root_sem);
+		spin_lock(&fs_info->block_group_cache_lock);
 		if (!caching_ctl) {
 			struct btrfs_caching_control *ctl;
@@ -1057,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		}
 		if (caching_ctl)
 			list_del_init(&caching_ctl->list);
-		up_write(&fs_info->commit_root_sem);
+		spin_unlock(&fs_info->block_group_cache_lock);
 		if (caching_ctl) {
 			/* Once for the caching bgs list and once for us. */
 			btrfs_put_caching_control(caching_ctl);
@@ -1723,6 +1644,7 @@ out:
 static int exclude_super_stripes(struct btrfs_block_group *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
+	const bool zoned = btrfs_is_zoned(fs_info);
 	u64 bytenr;
 	u64 *logical;
 	int stripe_len;
@@ -1744,6 +1666,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
 		if (ret)
 			return ret;
 
+		/* Shouldn't have super stripes in sequential zones */
+		if (zoned && nr) {
+			btrfs_err(fs_info,
+		"zoned: block group %llu must not contain super block",
+				  cache->start);
+			return -EUCLEAN;
+		}
+
 		while (nr--) {
 			u64 len = min_t(u64, stripe_len,
 				cache->start + cache->length - logical[nr]);
@@ -1805,7 +1735,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
 	INIT_LIST_HEAD(&cache->discard_list);
 	INIT_LIST_HEAD(&cache->dirty_list);
 	INIT_LIST_HEAD(&cache->io_list);
-	btrfs_init_free_space_ctl(cache);
+	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
 	atomic_set(&cache->frozen, 0);
 	mutex_init(&cache->free_space_lock);
 	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
@@ -1985,6 +1915,51 @@ error:
 	return ret;
 }
 
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+	struct btrfs_space_info *space_info;
+	struct rb_node *node;
+	int ret = 0;
+
+	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+		struct extent_map *em;
+		struct map_lookup *map;
+		struct btrfs_block_group *bg;
+
+		em = rb_entry(node, struct extent_map, rb_node);
+		map = em->map_lookup;
+		bg = btrfs_create_block_group_cache(fs_info, em->start);
+		if (!bg) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/* Fill dummy cache as FULL */
+		bg->length = em->len;
+		bg->flags = map->type;
+		bg->last_byte_to_unpin = (u64)-1;
+		bg->cached = BTRFS_CACHE_FINISHED;
+		bg->used = em->len;
+		bg->flags = map->type;
+		ret = btrfs_add_block_group_cache(fs_info, bg);
+		if (ret) {
+			btrfs_remove_free_space_cache(bg);
+			btrfs_put_block_group(bg);
+			break;
+		}
+		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
+					0, &space_info);
+		bg->space_info = space_info;
+		link_block_group(bg);
+
+		set_avail_alloc_bits(fs_info, bg->flags);
+	}
+	if (!ret)
+		btrfs_init_global_block_rsv(fs_info);
+	return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
@@ -1995,6 +1970,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	int need_clear = 0;
 	u64 cache_gen;
 
+	if (!info->extent_root)
+		return fill_dummy_bgs(info);
+
 	key.objectid = 0;
 	key.offset = 0;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2152,7 +2130,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 	cache->flags = type;
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
-	cache->needs_free_space = 1;
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		cache->needs_free_space = 1;
+
 	ret = exclude_super_stripes(cache);
 	if (ret) {
 		/* We may have excluded something, so call this just in case */
@@ -2361,6 +2340,9 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
 	int retries = 0;
 	int ret = 0;
 
+	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
+		return 0;
+
 	/*
 	 * If this block group is smaller than 100 megs don't bother caching the
 	 * block group.
@@ -2401,7 +2383,7 @@ again:
 	 * time.
 	 */
 	BTRFS_I(inode)->generation = 0;
-	ret = btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 	if (ret) {
 		/*
 		 * So theoretically we could recover from this, simply set the
@@ -3307,14 +3289,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_caching_control *caching_ctl;
 	struct rb_node *n;
 
-	down_write(&info->commit_root_sem);
+	spin_lock(&info->block_group_cache_lock);
 	while (!list_empty(&info->caching_block_groups)) {
 		caching_ctl = list_entry(info->caching_block_groups.next,
 					 struct btrfs_caching_control, list);
 		list_del(&caching_ctl->list);
 		btrfs_put_caching_control(caching_ctl);
 	}
-	up_write(&info->commit_root_sem);
+	spin_unlock(&info->block_group_cache_lock);
 
 	spin_lock(&info->unused_bgs_lock);
 	while (!list_empty(&info->unused_bgs)) {
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index adfd7583a17b..8f74a96074f7 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -268,6 +268,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
+void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
+				struct btrfs_caching_control *caching_ctl);
 
 static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
 {
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index bc920afe23bf..04a6226e0388 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->delayed_block_rsv.space_info = space_info;
 	fs_info->delayed_refs_rsv.space_info = space_info;
 
+	/*
+	 * Our various recovery options can leave us with NULL roots, so check
+	 * here and just bail before we go dereferencing NULLs everywhere.
+	 */
+	if (!fs_info->extent_root || !fs_info->csum_root ||
+	    !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
+		return;
+
 	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 92dd86bceae3..555cbcef6585 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -35,6 +35,13 @@ enum {
 	BTRFS_INODE_IN_DELALLOC_LIST,
 	BTRFS_INODE_HAS_PROPS,
 	BTRFS_INODE_SNAPSHOT_FLUSH,
+	/*
+	 * Set and used when logging an inode and it serves to signal that an
+	 * inode does not have xattrs, so subsequent fsyncs can avoid searching
+	 * for xattrs to log. This bit must be cleared whenever a xattr is added
+	 * to an inode.
+	 */
+	BTRFS_INODE_NO_XATTRS,
 };
 
 /* in memory btrfs inode */
@@ -50,7 +57,8 @@ struct btrfs_inode {
 	/*
 	 * Lock for counters and all fields used to determine if the inode is in
 	 * the log or not (last_trans, last_sub_trans, last_log_commit,
-	 * logged_trans).
+	 * logged_trans), to access/update new_delalloc_bytes and to update the
+	 * VFS' inode number of bytes used.
 	 */
 	spinlock_t lock;
@@ -203,16 +211,6 @@ struct btrfs_inode {
 	/* Hook into fs_info->delayed_iputs */
 	struct list_head delayed_iput;
 
-	/*
-	 * To avoid races between lockless (i_mutex not held) direct IO writes
-	 * and concurrent fsync requests. Direct IO writes must acquire read
-	 * access on this semaphore for creating an extent map and its
-	 * corresponding ordered extent.  The fast fsync path must acquire write
-	 * access on this semaphore before it collects ordered extents and
-	 * extent maps.
-	 */
-	struct rw_semaphore dio_sem;
-
 	struct inode vfs_inode;
 };
@@ -341,8 +339,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
 		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 {
 	struct btrfs_root *root = inode->root;
-	struct btrfs_super_block *sb = root->fs_info->super_copy;
-	const u16 csum_size = btrfs_super_csum_size(sb);
+	const u32 csum_size = root->fs_info->csum_size;
 
 	/* Output minus objectid, which is more meaningful */
 	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
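The csum_size churn in this and the following files is one mechanical pattern: the checksum size is computed once at mount (this series adds a csum_size field to btrfs_fs_info, presumably populated in open_ctree() from the on-disk superblock) instead of being re-derived from the superblock copy in every hot path. The before/after at a typical call site:

	/* Before: dig the size out of the superblock copy at each call site. */
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);

	/* After: read the cached value; note the width also grows to u32. */
	const u32 csum_size = fs_info->csum_size;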
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; struct page *page; unsigned long i; char *kaddr; @@ -150,7 +148,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (inode->flags & BTRFS_INODE_NODATASUM) + if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) return 0; shash->tfm = fs_info->csum_shash; @@ -220,7 +218,7 @@ static void end_compressed_bio_read(struct bio *bio) inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), bio, - (u64)bio->bi_iter.bi_sector << 9); + bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; @@ -622,13 +620,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned long pg_index; struct page *page; struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; + u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; u64 em_len; u64 em_start; struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -722,15 +719,12 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ refcount_inc(&cb->pending_bios); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, - (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); - sums += csum_size * nr_sectors; + sums += fs_info->csum_size * nr_sectors; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { @@ -751,10 +745,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 113da62dc17f..07810891e204 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1278,14 +1278,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (!tm) return eb; - btrfs_set_path_blocking(path); - btrfs_set_lock_blocking_read(eb); - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } @@ -1297,13 +1294,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, } else { eb_rewin = btrfs_clone_extent_buffer(eb); if (!eb_rewin) { - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); return NULL; } } - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), @@ -1356,7 +1353,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); 
free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, 0, level, NULL); + old = read_tree_block(fs_info, logical, root->root_key.objectid, + 0, level, NULL); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); @@ -1373,9 +1371,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(fs_info, logical); } else { - btrfs_set_lock_blocking_read(eb_root); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock_blocking(eb_root); + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); } @@ -1483,10 +1480,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, search_start = buf->start & ~((u64)SZ_1G - 1); - if (parent) - btrfs_set_lock_blocking_write(parent); - btrfs_set_lock_blocking_write(buf); - /* * Before CoWing this block for later modification, check if it's * the subtree root and do the delayed subtree trace if needed. @@ -1578,7 +1571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; u64 blocknr; - u64 gen; u64 search_start = *last_ret; u64 last_block = 0; u64 other; @@ -1586,14 +1578,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int end_slot; int i; int err = 0; - int parent_level; - int uptodate; u32 blocksize; int progress_passed = 0; struct btrfs_disk_key disk_key; - parent_level = btrfs_header_level(parent); - WARN_ON(trans->transaction != fs_info->running_transaction); WARN_ON(trans->transid != fs_info->generation); @@ -1604,10 +1592,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (parent_nritems <= 1) return 0; - btrfs_set_lock_blocking_write(parent); - for (i = start_slot; i <= end_slot; i++) { - struct btrfs_key first_key; int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1616,8 +1601,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, progress_passed = 1; blocknr = btrfs_node_blockptr(parent, i); - gen = btrfs_node_ptr_generation(parent, i); - btrfs_node_key_to_cpu(parent, &first_key, i); if (last_block == 0) last_block = blocknr; @@ -1634,36 +1617,13 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = find_extent_buffer(fs_info, blocknr); - if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen, 0); - else - uptodate = 0; - if (!cur || !uptodate) { - if (!cur) { - cur = read_tree_block(fs_info, blocknr, gen, - parent_level - 1, - &first_key); - if (IS_ERR(cur)) { - return PTR_ERR(cur); - } else if (!extent_buffer_uptodate(cur)) { - free_extent_buffer(cur); - return -EIO; - } - } else if (!uptodate) { - err = btrfs_read_buffer(cur, gen, - parent_level - 1,&first_key); - if (err) { - free_extent_buffer(cur); - return err; - } - } - } + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); if (search_start == 0) search_start = last_block; btrfs_tree_lock(cur); - btrfs_set_lock_blocking_write(cur); err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, @@ -1723,9 +1683,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb, oip = offset_in_page(offset); if (oip + key_size <= PAGE_SIZE) { - const unsigned long idx = offset >> PAGE_SHIFT; + const unsigned long idx = get_eb_page_index(offset); char *kaddr = page_address(eb->pages[idx]); + oip = get_eb_offset_in_page(eb, offset); tmp = (struct btrfs_disk_key *)(kaddr + oip); } else { read_extent_buffer(eb, &unaligned, offset, key_size); @@ 
-1801,6 +1762,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_node_key_to_cpu(parent, &first_key, slot); eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), + btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { @@ -1835,8 +1797,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = path->nodes[level]; - WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && - path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); + WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK); WARN_ON(btrfs_header_generation(mid) != trans->transid); orig_ptr = btrfs_node_blockptr(mid, orig_slot); @@ -1865,7 +1826,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } btrfs_tree_lock(child); - btrfs_set_lock_blocking_write(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, BTRFS_NESTING_COW); if (ret) { @@ -1904,7 +1864,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1920,7 +1879,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -2084,7 +2042,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 left_nr; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2139,7 +2096,6 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, u32 right_nr; __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -2243,7 +2199,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(fs_info, search); + btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; @@ -2252,16 +2208,11 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, } } -static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, int level) +static noinline void reada_for_balance(struct btrfs_path *path, int level) { + struct extent_buffer *parent; int slot; int nritems; - struct extent_buffer *parent; - struct extent_buffer *eb; - u64 gen; - u64 block1 = 0; - u64 block2 = 0; parent = path->nodes[level + 1]; if (!parent) @@ -2270,32 +2221,10 @@ static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - if (slot > 0) { - block1 = btrfs_node_blockptr(parent, slot - 1); - gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = find_extent_buffer(fs_info, block1); - /* - * if we get -eagain from btrfs_buffer_uptodate, we - * don't want to return eagain here. 
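The readahead cleanup below folds the open-coded find-buffer/check-uptodate/readahead sequence into btrfs_readahead_node_child(parent, slot). A sketch of roughly what that helper must do, pieced together from the removed readahead_tree_block() body later in this diff and the new btrfs_find_create_tree_block() signature (the real helper may additionally pass the child generation for the uptodate check):

	void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
	{
		struct extent_buffer *eb;

		eb = btrfs_find_create_tree_block(node->fs_info,
						  btrfs_node_blockptr(node, slot),
						  btrfs_header_owner(node),
						  btrfs_header_level(node) - 1);
		if (IS_ERR(eb))
			return;
		/* Non-blocking read; drop the ref either way. */
		if (read_extent_buffer_pages(eb, WAIT_NONE, 0) < 0)
			free_extent_buffer_stale(eb);
		else
			free_extent_buffer(eb);
	}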
That will loop - * forever - */ - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block1 = 0; - free_extent_buffer(eb); - } - if (slot + 1 < nritems) { - block2 = btrfs_node_blockptr(parent, slot + 1); - gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = find_extent_buffer(fs_info, block2); - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block2 = 0; - free_extent_buffer(eb); - } - - if (block1) - readahead_tree_block(fs_info, block1); - if (block2) - readahead_tree_block(fs_info, block2); + if (slot > 0) + btrfs_readahead_node_child(parent, slot - 1); + if (slot + 1 < nritems) + btrfs_readahead_node_child(parent, slot + 1); } @@ -2399,14 +2328,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. - * We must do this without dropping locks so - * we can trust our generation number - */ - btrfs_set_path_blocking(p); - /* now we're allowed to do a blocking uptodate check */ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); if (!ret) { @@ -2426,14 +2347,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * out which blocks to read. */ btrfs_unlock_up_safe(p, level + 1); - btrfs_set_path_blocking(p); if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1, - &first_key); + tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, + gen, parent_level - 1, &first_key); if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, @@ -2468,58 +2388,42 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, int *write_lock_level) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + int ret = 0; if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { - int sret; if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } - btrfs_set_path_blocking(p); - reada_for_balance(fs_info, p, level); - sret = split_node(trans, root, p, level); + reada_for_balance(p, level); + ret = split_node(trans, root, p, level); - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { - int sret; if (*write_lock_level < level + 1) { *write_lock_level = level + 1; btrfs_release_path(p); - goto again; + return -EAGAIN; } - btrfs_set_path_blocking(p); - reada_for_balance(fs_info, p, level); - sret = balance_level(trans, root, p, level); + reada_for_balance(p, level); + ret = balance_level(trans, root, p, level); + if (ret) + return ret; - if (sret) { - ret = sret; - goto done; - } b = p->nodes[level]; if (!b) { btrfs_release_path(p); - goto again; + return -EAGAIN; } BUG_ON(btrfs_header_nritems(b) == 1); } - return 0; - -again: - ret = -EAGAIN; -done: return ret; } @@ -2616,7 +2520,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = __btrfs_read_lock_root_node(root, p->recurse); + b = btrfs_read_lock_root_node(root); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2752,7 +2656,6 @@ again: goto again; } - btrfs_set_path_blocking(p); if (last_level) err = 
btrfs_cow_block(trans, root, b, NULL, 0, &b, @@ -2822,7 +2725,6 @@ cow_done: goto again; } - btrfs_set_path_blocking(p); err = split_leaf(trans, root, key, p, ins_len, ret == 0); @@ -2884,17 +2786,10 @@ cow_done: if (!p->skip_locking) { level = btrfs_header_level(b); if (level <= write_lock_level) { - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - } + btrfs_tree_lock(b); p->locks[level] = BTRFS_WRITE_LOCK; } else { - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, - p->recurse); - } + btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK; } p->nodes[level] = b; @@ -2902,12 +2797,6 @@ cow_done: } ret = 1; done: - /* - * we don't really know what they plan on doing with the path - * from here on, so for now just mark it as blocking - */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; @@ -2999,10 +2888,7 @@ again: } level = btrfs_header_level(b); - if (!btrfs_tree_read_lock_atomic(b)) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - } + btrfs_tree_read_lock(b); b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; @@ -3013,8 +2899,6 @@ again: } ret = 1; done: - if (!p->leave_spinning) - btrfs_set_path_blocking(p); if (ret < 0) btrfs_release_path(p); @@ -3441,7 +3325,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); atomic_inc(&c->refs); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; path->slots[level] = 0; return 0; } @@ -3562,7 +3446,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - ret = 0; btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); @@ -3580,7 +3463,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_tree_unlock(split); free_extent_buffer(split); } - return ret; + return 0; } /* @@ -3814,7 +3697,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 1; __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); - btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -4053,7 +3935,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root return 1; __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); - btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -4448,7 +4329,6 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; } - btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); if (ret) goto err; @@ -4478,8 +4358,6 @@ static noinline int split_item(struct btrfs_path *path, leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); - btrfs_set_path_blocking(path); - item = btrfs_item_nr(path->slots[0]); orig_offset = btrfs_item_offset(leaf, item); item_size = btrfs_item_size(leaf, item); @@ -5055,7 +4933,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - btrfs_set_path_blocking(path); btrfs_clean_tree_block(leaf); btrfs_del_leaf(trans, root, path, leaf); } @@ -5077,7 +4954,6 @@ int 
btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, slot = path->slots[1]; atomic_inc(&leaf->refs); - btrfs_set_path_blocking(path); wret = push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) @@ -5248,7 +5124,6 @@ find_next_key: */ if (slot >= nritems) { path->slots[level] = slot; - btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -5265,7 +5140,6 @@ find_next_key: ret = 0; goto out; } - btrfs_set_path_blocking(path); cur = btrfs_read_node_slot(cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); @@ -5282,7 +5156,6 @@ out: path->keep_locks = keep_locks; if (ret == 0) { btrfs_unlock_up_safe(path, path->lowest_level + 1); - btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); } return ret; @@ -5384,8 +5257,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key key; u32 nritems; int ret; - int old_spinning = path->leave_spinning; - int next_rw_lock = 0; + int i; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -5395,11 +5267,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, again: level = 1; next = NULL; - next_rw_lock = 0; btrfs_release_path(path); path->keep_locks = 1; - path->leave_spinning = 1; if (time_seq) ret = btrfs_search_old_slot(root, &key, path, time_seq); @@ -5459,13 +5329,22 @@ again: continue; } - if (next) { - btrfs_tree_unlock_rw(next, next_rw_lock); - free_extent_buffer(next); + + /* + * Our current level is where we're going to start from, and to + * make sure lockdep doesn't complain we need to drop our locks + * and nodes from 0 to our current level. + */ + for (i = 0; i < level; i++) { + if (path->locks[level]) { + btrfs_tree_read_unlock(path->nodes[i]); + path->locks[i] = 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; } next = c; - next_rw_lock = path->locks[level]; ret = read_block_for_search(root, path, &next, level, slot, &key); if (ret == -EAGAIN) @@ -5491,28 +5370,18 @@ again: cond_resched(); goto again; } - if (!ret) { - btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, - BTRFS_NESTING_RIGHT, - path->recurse); - } - next_rw_lock = BTRFS_READ_LOCK; + if (!ret) + btrfs_tree_read_lock(next); } break; } path->slots[level] = slot; while (1) { level--; - c = path->nodes[level]; - if (path->locks[level]) - btrfs_tree_unlock_rw(c, path->locks[level]); - - free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) - path->locks[level] = next_rw_lock; + path->locks[level] = BTRFS_READ_LOCK; if (!level) break; @@ -5526,23 +5395,12 @@ again: goto done; } - if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, - BTRFS_NESTING_RIGHT, - path->recurse); - } - next_rw_lock = BTRFS_READ_LOCK; - } + if (!path->skip_locking) + btrfs_tree_read_lock(next); } ret = 0; done: unlock_up(path, 0, 1, 0, NULL); - path->leave_spinning = old_spinning; - if (!old_spinning) - btrfs_set_path_blocking(path); return ret; } @@ -5564,7 +5422,6 @@ int btrfs_previous_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, path); if (ret != 0) return ret; @@ -5606,7 +5463,6 @@ int btrfs_previous_extent_item(struct btrfs_root *root, while (1) { if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); ret = btrfs_prev_leaf(root, 
path); if (ret != 0) return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b29bdb25105..1d3c1e479f3d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,7 +17,6 @@ #include <linux/wait.h> #include <linux/slab.h> #include <trace/events/btrfs.h> -#include <asm/kmap_types.h> #include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> @@ -28,6 +27,7 @@ #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> +#include <linux/iomap.h> #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -67,12 +67,6 @@ struct btrfs_ref; #define BTRFS_OLDEST_GENERATION 0ULL /* - * the max metadata block size. This limit is somewhat artificial, - * but the memmove costs go through the roof for larger blocks. - */ -#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 - -/* * we can actually store much bigger names, but lets not confuse the rest * of linux */ @@ -370,11 +364,9 @@ struct btrfs_path { unsigned int search_for_split:1; unsigned int keep_locks:1; unsigned int skip_locking:1; - unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; - unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -469,10 +461,11 @@ struct btrfs_discard_ctl { struct btrfs_block_group *block_group; struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; u64 prev_discard; + u64 prev_discard_time; atomic_t discardable_extents; atomic64_t discardable_bytes; u64 max_discard_size; - unsigned long delay; + u64 delay_ms; u32 iops_limit; u32 kbps_limit; u64 discard_extent_bytes; @@ -559,6 +552,9 @@ enum { /* Indicate that the discard workqueue can service discards. */ BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, }; /* @@ -912,6 +908,7 @@ struct btrfs_fs_info { /* Extent buffer radix tree */ spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ @@ -934,6 +931,10 @@ struct btrfs_fs_info { /* Cached block sizes */ u32 nodesize; u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; u32 stripesize; /* Block groups and devices containing active swapfiles. */ @@ -951,6 +952,18 @@ struct btrfs_fs_info { /* Type of exclusive operation running */ unsigned long exclusive_operation; + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + union { + u64 zone_size; + u64 zoned; + }; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1021,7 +1034,7 @@ enum { BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, - /* The root has a log tree. Used only for subvolume roots. */ + /* The root has a log tree. Used for subvolume roots and the tree root. 
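The zone_size/zoned union added to btrfs_fs_info makes "is this filesystem zoned" a plain non-zero test, while zone-aware code reads the same storage under its meaningful name (btrfs_is_zoned() further down in this file is the predicate). Illustrative use only; the helper name here is invented for the example:

	static u64 example_zone_bytes(const struct btrfs_fs_info *fs_info)
	{
		/* zone_size and zoned alias the same u64. */
		if (btrfs_is_zoned(fs_info))
			return fs_info->zone_size;
		return 0;
	}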
*/ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, @@ -1060,15 +1073,6 @@ struct btrfs_root { spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; - /* free ino cache stuff */ - struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type ino_cache_state; - spinlock_t ino_cache_lock; - wait_queue_head_t ino_cache_wait; - struct btrfs_free_space_ctl *free_ino_pinned; - u64 ino_cache_progress; - struct inode *ino_cache_inode; - struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1227,6 +1231,63 @@ struct btrfs_replace_extent_info { int insertions; }; +/* Arguments for btrfs_drop_extents() */ +struct btrfs_drop_extents_args { + /* Input parameters */ + + /* + * If NULL, btrfs_drop_extents() will allocate and free its own path. + * If 'replace_extent' is true, this must not be NULL. Also the path + * is always released except if 'replace_extent' is true and + * btrfs_drop_extents() sets 'extent_inserted' to true, in which case + * the path is kept locked. + */ + struct btrfs_path *path; + /* Start offset of the range to drop extents from */ + u64 start; + /* End (exclusive, last byte + 1) of the range to drop extents from */ + u64 end; + /* If true drop all the extent maps in the range */ + bool drop_cache; + /* + * If true it means we want to insert a new extent after dropping all + * the extents in the range. If this is true, the 'extent_item_size' + * parameter must be set as well and the 'extent_inserted' field will + * be set to true by btrfs_drop_extents() if it could insert the new + * extent. + * Note: when this is set to true the path must not be NULL. + */ + bool replace_extent; + /* + * Used if 'replace_extent' is true. Size of the file extent item to + * insert after dropping all existing extents in the range + */ + u32 extent_item_size; + + /* Output parameters */ + + /* + * Set to the minimum between the input parameter 'end' and the end + * (exclusive, last byte + 1) of the last dropped extent. This is always + * set even if btrfs_drop_extents() returns an error. + */ + u64 drop_end; + /* + * The number of allocated bytes found in the range. This can be smaller + * than the range's length when there are holes in the range. + */ + u64 bytes_found; + /* + * Only set if 'replace_extent' is true. Set to true if we were able + * to insert a replacement extent after dropping all extents in the + * range, otherwise set to false by btrfs_drop_extents(). + * Also, if btrfs_drop_extents() has set this to true it means it + * returned with the path locked, otherwise if it has set this to + * false it has returned with the path released. 
+ */ + bool extent_inserted; +}; + struct btrfs_file_private { void *filldir_buf; }; @@ -1285,7 +1346,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +/* bit 17 is free */ #define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) @@ -1298,6 +1359,8 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) +#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) +#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -1330,9 +1393,7 @@ do { \ * transaction commit) */ -#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) -#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) -#define BTRFS_PENDING_COMMIT (2) +#define BTRFS_PENDING_COMMIT (0) #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) @@ -1405,7 +1466,7 @@ struct btrfs_map_token { }; #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ - ((bytes) >> (fs_info)->sb->s_blocksize_bits) + ((bytes) >> (fs_info)->sectorsize_bits) static inline void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) @@ -1490,13 +1551,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - const type *p = page_address(eb->pages[0]); \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ - type *p = page_address(eb->pages[0]); \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } @@ -2086,6 +2148,7 @@ BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); @@ -2518,7 +2581,17 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); +/* + * Take the number of bytes to be checksummmed and figure out how many leaves + * it would require to store the csums for that many bytes. 
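A worked example for the btrfs_csum_bytes_to_leaves() helper defined just below, with illustrative numbers: 4 KiB sectorsize (sectorsize_bits == 12), crc32c checksums (csum_size == 4), and a 16 KiB nodesize, which gives roughly csums_per_leaf == 4064:

	/*
	 * csum_bytes = 1 GiB:
	 *   num_csums = (1ULL << 30) >> 12          = 262144 checksums
	 *   leaves    = DIV_ROUND_UP(262144, 4064)  = 65 leaves
	 */

The shift by sectorsize_bits is exactly the 64-bit division that the cached value exists to avoid.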
+ */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} /* * Use this if we would be adding new items, as we could split nodes as we cow @@ -2593,7 +2666,6 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); @@ -2940,8 +3012,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 offset, u8 *dst); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2968,13 +3039,13 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); -void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); @@ -2994,11 +3065,11 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, u64 new_size, + struct btrfs_inode *inode, u64 new_size, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); @@ -3038,14 +3109,13 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); + struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); + struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); void 
btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); @@ -3063,7 +3133,18 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); +extern const struct iomap_ops btrfs_dio_iomap_ops; +extern const struct iomap_dio_ops btrfs_dio_ops; + +/* Inode locking type flags, by default the exclusive lock is taken */ +#define BTRFS_ILOCK_SHARED (1U << 0) +#define BTRFS_ILOCK_TRY (1U << 1) + +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3093,16 +3174,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache); + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, @@ -3112,7 +3186,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached); + struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes); @@ -3292,6 +3366,39 @@ static inline void assertfail(const char *expr, const char* file, int line) { } #endif /* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. + */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. 
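A worked example for the two subpage helpers in this hunk, with illustrative numbers: 64 KiB PAGE_SIZE, 16 KiB nodesize, and an extent buffer at bytenr 81920, so that offset_in_page(eb->start) == 16384:

	/*
	 * get_eb_page_index(0)         -> 0     (a 16K block fits in one page)
	 * get_eb_offset_in_page(eb, 0) -> offset_in_page(0 + 81920) = 16384
	 *
	 * With 4 KiB pages eb->start is page aligned, so adding it changes
	 * nothing and both helpers reduce to the pre-subpage arithmetic.
	 */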
+ * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + +/* * Use that for functions that are conditionally exported for sanity tests but * otherwise static */ @@ -3600,4 +3707,9 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) } #endif +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zoned != 0; +} + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5aba81e16113..70c0340d839c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -740,13 +740,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, goto out; } - /* - * we need allocate some memory space, but it might cause the task - * to sleep, so we set all locked nodes in the path to blocking locks - * first. - */ - btrfs_set_path_blocking(path); - keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); if (!keys) { ret = -ENOMEM; @@ -1154,7 +1147,6 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; @@ -1219,7 +1211,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, btrfs_release_delayed_node(delayed_node); return -ENOMEM; } - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; @@ -1264,7 +1255,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) ret = -ENOMEM; goto trans_out; } - path->leave_spinning = 1; block_rsv = trans->block_rsv; trans->block_rsv = &fs_info->delayed_block_rsv; @@ -1333,7 +1323,6 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!delayed_node) break; - path->leave_spinning = 1; root = delayed_node->root; trans = btrfs_join_transaction(root); @@ -1826,27 +1815,29 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node; int ret = 0; - delayed_node = btrfs_get_or_create_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, + &inode->vfs_inode); goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, BTRFS_I(inode), + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, delayed_node); if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index ca96ef007d8f..b2412160c5bc 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -110,7 +110,8 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int 
btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); + struct btrfs_root *root, + struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 10638537b9ef..a98e33f232d5 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -21,6 +21,7 @@ #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" +#include "zoned.h" /* * Device replace overview @@ -96,7 +97,7 @@ no_valid_dev_replace_entry_found: * a replace target, fail the mount. */ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -159,7 +160,7 @@ no_valid_dev_replace_entry_found: * replace target, fail the mount. */ if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) { + BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -171,10 +172,10 @@ no_valid_dev_replace_entry_found: case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL, true); + src_devid, NULL, NULL); dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, BTRFS_DEV_REPLACE_DEVID, - NULL, NULL, true); + NULL, NULL); /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -259,6 +260,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + btrfs_err(fs_info, + "dev-replace: zoned type of target device mismatch with filesystem"); + ret = -EINVAL; + goto error; + } + sync_blockdev(bdev); list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { @@ -313,6 +321,10 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error; + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 863367c2c620..98b63ebed539 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -127,7 +127,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; btrfs_cpu_key_to_disk(&disk_key, location); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 741c7e19c32f..1db966bf85b2 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -355,7 +355,7 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, block_group = find_next_block_group(discard_ctl, now); if (block_group) { - unsigned long delay = discard_ctl->delay; + u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); /* @@ -366,9 +366,9 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, if (kbps_limit && discard_ctl->prev_discard) { u64 bps_limit = ((u64)kbps_limit) * SZ_1K; u64 bps_delay = div64_u64(discard_ctl->prev_discard * - MSEC_PER_SEC, 
bps_limit); + NSEC_PER_SEC, bps_limit); - delay = max(delay, msecs_to_jiffies(bps_delay)); + delay = max(delay, bps_delay); } /* @@ -378,11 +378,20 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, if (now < block_group->discard_eligible_time) { u64 bg_timeout = block_group->discard_eligible_time - now; - delay = max(delay, nsecs_to_jiffies(bg_timeout)); + delay = max(delay, bg_timeout); + } + + if (override && discard_ctl->prev_discard) { + u64 elapsed = now - discard_ctl->prev_discard_time; + + if (delay > elapsed) + delay -= elapsed; + else + delay = 0; } mod_delayed_work(discard_ctl->discard_workers, - &discard_ctl->work, delay); + &discard_ctl->work, nsecs_to_jiffies(delay)); } out: spin_unlock(&discard_ctl->lock); @@ -465,7 +474,12 @@ static void btrfs_discard_workfn(struct work_struct *work) discard_ctl->discard_extent_bytes += trimmed; } + /* + * Updated without locks as this is inside the workfn and nothing else + * is reading the values + */ discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = ktime_get_ns(); /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { @@ -519,7 +533,6 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) s64 discardable_bytes; u32 iops_limit; unsigned long delay; - unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC; discardable_extents = atomic_read(&discard_ctl->discardable_extents); if (!discardable_extents) @@ -550,12 +563,13 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) iops_limit = READ_ONCE(discard_ctl->iops_limit); if (iops_limit) - lower_limit = max_t(unsigned long, lower_limit, - MSEC_PER_SEC / iops_limit); + delay = MSEC_PER_SEC / iops_limit; + else + delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; - delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; - delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC); - discard_ctl->delay = msecs_to_jiffies(delay); + delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, + BTRFS_DISCARD_MAX_DELAY_MSEC); + discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); } @@ -563,15 +577,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) /** * btrfs_discard_update_discardable - propagate discard counters * @block_group: block_group of interest - * @ctl: free_space_ctl of @block_group * * This propagates deltas of counters up to the discard_ctl. It maintains a * current counter and a previous counter passing the delta up to the global * stat. Then the current counter value becomes the previous counter value. 
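A worked pass through the reworked scheduling math, with illustrative values: delay_ms = 100 gives a base delay of 100 * NSEC_PER_MSEC; with kbps_limit = 100 (so bps_limit = 102400) and prev_discard = 1 MiB:

	/*
	 * bps_delay = 1048576 * NSEC_PER_SEC / 102400 ~= 10.24s, which
	 * dominates the 100ms base. On an override, if 4s have already
	 * elapsed since prev_discard_time, the timer is rearmed for the
	 * remaining ~6.24s instead of a fresh full interval. Keeping the
	 * arithmetic in u64 nanoseconds until the final nsecs_to_jiffies()
	 * is what removes the jiffies/msecs juggling of the old code.
	 */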
*/ -void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, - struct btrfs_free_space_ctl *ctl) +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { + struct btrfs_free_space_ctl *ctl; struct btrfs_discard_ctl *discard_ctl; s32 extents_delta; s64 bytes_delta; @@ -581,8 +594,10 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, !btrfs_is_block_group_data_only(block_group)) return; + ctl = block_group->free_space_ctl; discard_ctl = &block_group->fs_info->discard_ctl; + lockdep_assert_held(&ctl->tree_lock); extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - ctl->discardable_extents[BTRFS_STAT_PREV]; if (extents_delta) { @@ -684,10 +699,11 @@ void btrfs_discard_init(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&discard_ctl->discard_list[i]); discard_ctl->prev_discard = 0; + discard_ctl->prev_discard_time = 0; atomic_set(&discard_ctl->discardable_extents, 0); atomic64_set(&discard_ctl->discardable_bytes, 0); discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; - discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC; + discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; discard_ctl->kbps_limit = 0; discard_ctl->discard_extent_bytes = 0; diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 353228d62f5a..57b9202f427f 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -28,8 +28,7 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); /* Update operations */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); -void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, - struct btrfs_free_space_ctl *ctl); +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group); /* Setup/cleanup operations */ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index af97ddcc6b3e..765deefda92b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" @@ -42,6 +41,7 @@ #include "block-group.h" #include "discard.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -109,15 +109,13 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. 
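With the free_space_ctl now derived from the block group, the locking contract is carried by the lockdep assertion instead of the argument list. A sketch of the expected call-site shape under that assumption (the actual free-space manipulation is elided):

	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;

	spin_lock(&ctl->tree_lock);
	/* ... update ctl->discardable_extents / discardable_bytes ... */
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);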
*/ struct async_submit_bio { - void *private_data; + struct inode *inode; struct bio *bio; extent_submit_bio_start_t *submit_bio_start; int mirror_num; - /* - * bio_offset is optional, can be used if the pages in the bio - * can't tell us where in the file the bio should go - */ - u64 bio_offset; + + /* Optional parameter for submit_bio_start used by direct io */ + u64 dio_file_offset; struct btrfs_work work; blk_status_t status; }; @@ -150,40 +148,41 @@ struct async_submit_bio { # error # endif +#define DEFINE_LEVEL(stem, level) \ + .names[level] = "btrfs-" stem "-0" #level, + +#define DEFINE_NAME(stem) \ + DEFINE_LEVEL(stem, 0) \ + DEFINE_LEVEL(stem, 1) \ + DEFINE_LEVEL(stem, 2) \ + DEFINE_LEVEL(stem, 3) \ + DEFINE_LEVEL(stem, 4) \ + DEFINE_LEVEL(stem, 5) \ + DEFINE_LEVEL(stem, 6) \ + DEFINE_LEVEL(stem, 7) + static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - const char *name_stem; /* lock name stem */ - char names[BTRFS_MAX_LEVEL + 1][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; + /* Longest entry: btrfs-free-space-00 */ + char names[BTRFS_MAX_LEVEL][20]; + struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, - { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, - { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, - { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, - { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, - { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, - { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, - { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, - { .id = 0, .name_stem = "tree" }, + { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, + { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, + { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, + { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, + { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, + { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, + { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, + { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, + { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, + { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = 0, DEFINE_NAME("tree") }, }; -void __init btrfs_init_lockdep(void) -{ - int i, j; - - /* initialize lockdep class names */ - for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { - struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; - - for (j = 0; j < ARRAY_SIZE(ks->names); j++) - snprintf(ks->names[j], sizeof(ks->names[j]), - "btrfs-%s-%02d", ks->name_stem, j); - } -} +#undef DEFINE_LEVEL +#undef DEFINE_NAME void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) @@ -210,15 +209,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) { struct btrfs_fs_info *fs_info = buf->fs_info; const int num_pages = fs_info->nodesize >> PAGE_SHIFT; + const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; int i; shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - kaddr = 
page_address(buf->pages[0]); + kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start); crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - PAGE_SIZE - BTRFS_CSUM_SIZE); + first_page_part - BTRFS_CSUM_SIZE); for (i = 1; i < num_pages; i++) { kaddr = page_address(buf->pages[i]); @@ -248,10 +248,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) { + if (need_lock) btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); @@ -280,7 +278,7 @@ out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (need_lock) - btrfs_tree_read_unlock_blocking(eb); + btrfs_tree_read_unlock(eb); return ret; } @@ -319,7 +317,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); - if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + if (memcmp(disk_sb->csum, result, fs_info->csum_size)) return 1; return 0; @@ -443,16 +441,16 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, } /* - * checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block + * Checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block. + * For subpage extent buffers we need bvec to also read the offset in the page. */ - -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) { + struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; u8 result[BTRFS_CSUM_SIZE]; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); struct extent_buffer *eb; int ret; @@ -489,7 +487,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return ret; } - write_extent_buffer(eb, result, 0, csum_size); + write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; } @@ -523,65 +521,37 @@ static int check_tree_block_fsid(struct extent_buffer *eb) return 1; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror) +/* Do basic extent buffer checks at read time */ +static int validate_extent_buffer(struct extent_buffer *eb) { + struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; - int found_level; - struct extent_buffer *eb; - struct btrfs_fs_info *fs_info; - u16 csum_size; - int ret = 0; + const u32 csum_size = fs_info->csum_size; + u8 found_level; u8 result[BTRFS_CSUM_SIZE]; - int reads_done; - - if (!page->private) - goto out; - - eb = (struct extent_buffer *)page->private; - fs_info = eb->fs_info; - csum_size = btrfs_super_csum_size(fs_info->super_copy); - - /* the pending IO might have been the only thing that kept this buffer - * in memory. 
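Worked numbers for the csum_tree_block() change at the top of this hunk, illustrative only: on a 64 KiB PAGE_SIZE machine with a 16 KiB nodesize,

	/*
	 * num_pages       = 16384 >> 16       = 0   (the page loop never runs)
	 * first_page_part = min(65536, 16384) = 16384
	 * so the csum covers bytes BTRFS_CSUM_SIZE..16383 starting at
	 * page_address(pages[0]) + offset_in_page(buf->start).
	 *
	 * With 4 KiB pages: num_pages = 4 and first_page_part = 4096, i.e.
	 * behaviour is unchanged for the sectorsize == PAGE_SIZE case.
	 */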
Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } + int ret = 0; found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", eb->start, found_start); ret = -EIO; - goto err; + goto out; } if (check_tree_block_fsid(eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; - goto err; + goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "bad tree block level %d on %llu", (int)btrfs_header_level(eb), eb->start); ret = -EIO; - goto err; + goto out; } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), - eb, found_level); - csum_tree_block(eb, result); if (memcmp_extent_buffer(eb, result, 0, csum_size)) { @@ -595,7 +565,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; - goto err; + goto out; } /* @@ -617,6 +587,37 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, btrfs_err(fs_info, "block=%llu read time tree block corruption detected", eb->start); +out: + return ret; +} + +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, + struct page *page, u64 start, u64 end, + int mirror) +{ + struct extent_buffer *eb; + int ret = 0; + int reads_done; + + ASSERT(page->private); + eb = (struct extent_buffer *)page->private; + + /* + * The pending IO might have been the only thing that kept this buffer + * in memory. Make sure we have a ref for all this other checks + */ + atomic_inc(&eb->refs); + + reads_done = atomic_dec_and_test(&eb->io_pages); + if (!reads_done) + goto err; + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -632,7 +633,7 @@ err: clear_extent_buffer_uptodate(eb); } free_extent_buffer(eb); -out: + return ret; } @@ -694,8 +695,8 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->private_data, async->bio, - async->bio_offset); + ret = async->submit_bio_start(async->inode, async->bio, + async->dio_file_offset); if (ret) async->status = ret; } @@ -715,7 +716,7 @@ static void run_one_async_done(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - inode = async->private_data; + inode = async->inode; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -745,18 +746,19 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, +blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset, void *private_data, + u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return BLK_STS_RESOURCE; - async->private_data = private_data; + async->inode 
= inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -764,7 +766,7 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - async->bio_offset = bio_offset; + async->dio_file_offset = dio_file_offset; async->status = 0; @@ -785,7 +787,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec); if (ret) break; } @@ -793,8 +795,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, - u64 bio_offset) +static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { /* * when we're called for a write, we're already in the async @@ -840,8 +842,8 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, - 0, inode, btree_submit_bio_start); + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + 0, btree_submit_bio_start); } if (ret) @@ -947,47 +949,33 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(fs_info, bytenr); - if (IS_ERR(buf)) - return; - - ret = read_extent_buffer_pages(buf, WAIT_NONE, 0); - if (ret < 0) - free_extent_buffer_stale(buf); - else - free_extent_buffer(buf); -} - struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr) + u64 bytenr, u64 owner_root, + int level) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); - return alloc_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr, owner_root, level); } /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. * + * @owner_root: the objectid of the root owner for this block. 
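The owner_root parameter threaded through read_tree_block() here matches the call sites updated earlier in the ctree.c hunks (btrfs_read_node_slot, read_block_for_search, the log tree replay). A representative updated caller, argument shapes taken from those hunks:

	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);

Carrying the owning root's objectid down to the buffer allocation is what lets the lockdep class (see the btrfs_lockdep_keysets table above) be chosen when the extent buffer is created, rather than after a successful read.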
* @parent_transid: expected transid of this tree block, skip check if 0 * @level: expected level, mandatory check * @first_key: expected key in slot 0, skip check if NULL */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid, int level, - struct btrfs_key *first_key) + u64 owner_root, u64 parent_transid, + int level, struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return buf; @@ -1012,8 +1000,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -buf->len, fs_info->dirty_metadata_batch); - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking_write(buf); clear_extent_buffer_dirty(buf); } } @@ -1155,7 +1141,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail; + goto fail_unlock; } root->node = leaf; @@ -1164,8 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - root->root_item.flags = 0; - root->root_item.byte_limit = 0; + btrfs_set_root_flags(&root->root_item, 0); + btrfs_set_root_limit(&root->root_item, 0); btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); @@ -1177,7 +1163,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, generate_random_guid(root->root_item.uuid); else export_guid(root->root_item.uuid, &guid_null); - root->root_item.drop_level = 0; + btrfs_set_root_drop_level(&root->root_item, 0); + + btrfs_tree_unlock(leaf); key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; @@ -1186,13 +1174,12 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_tree_unlock(leaf); - return root; -fail: +fail_unlock: if (leaf) btrfs_tree_unlock(leaf); +fail: btrfs_put_root(root); return ERR_PTR(ret); @@ -1307,7 +1294,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation, level, NULL); + key->objectid, generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -1348,14 +1335,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) int ret; unsigned int nofs_flag; - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; - goto fail; - } - /* * We might be called under a transaction (e.g. 
indirect backref * resolution) which could deadlock if it triggers memory reclaim @@ -1372,10 +1351,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_check_and_init_root_item(&root->root_item); } - btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->ino_cache_lock); - init_waitqueue_head(&root->ino_cache_wait); - /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M @@ -1774,13 +1749,13 @@ static int transaction_kthread(void *arg) struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; - time64_t now; + time64_t delta; unsigned long delay; bool cannot_commit; do { cannot_commit = false; - delay = HZ * fs_info->commit_interval; + delay = msecs_to_jiffies(fs_info->commit_interval * 1000); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1790,12 +1765,13 @@ static int transaction_kthread(void *arg) goto sleep; } - now = ktime_get_seconds(); + delta = ktime_get_seconds() - cur->start_time; if (cur->state < TRANS_STATE_COMMIT_START && - (now < cur->start_time || - now - cur->start_time < fs_info->commit_interval)) { + delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay = HZ * 5; + delay -= msecs_to_jiffies((delta - 1) * 1000); + delay = min(delay, + msecs_to_jiffies(fs_info->commit_interval * 1000)); goto sleep; } transid = cur->transid; @@ -2044,8 +2020,6 @@ void btrfs_put_root(struct btrfs_root *root) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); list_del_init(&root->leak_list); @@ -2260,8 +2234,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, return -ENOMEM; log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1, - level, NULL); + BTRFS_TREE_LOG_OBJECTID, + fs_info->generation + 1, level, + NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2306,30 +2281,42 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + /* If IGNOREDATACSUMS is set don't bother reading the csum root. 
*/ + if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + } } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; /* * This tree can share blocks with some other fs tree during relocation @@ -2338,11 +2325,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); @@ -2355,9 +2345,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - if (ret != -ENOENT) - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + goto out; + } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2367,11 +2359,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; } return 0; @@ -2627,6 +2622,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) generation = btrfs_super_generation(sb); level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), + BTRFS_ROOT_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(tree_root->node)) { handle_error = true; @@ -2791,6 +2787,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; + fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; spin_lock_init(&fs_info->swapfile_pins_lock); @@ -2873,6 +2870,109 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +/* + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount + * code paths. + */ +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) +{ + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); +} + +/* + * Mounting logic specific to read-write file systems. Shared by open_ctree + * and btrfs_remount when remounting from read-only to read-write. 
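+ *
+ * A sketch of the intended call sites (illustrative; the btrfs_remount hunk
+ * itself is not part of this diff):
+ *
+ *	open_ctree():    called when !sb_rdonly(sb), see the hunk below
+ *	btrfs_remount(): called on the read-only to read-write transition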
+ */ +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) +{ + int ret; + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + bool clear_free_space_tree = false; + + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = true; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = true; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + goto out; + } + } + + ret = btrfs_cleanup_fs_roots(fs_info); + if (ret) + goto out; + + down_read(&fs_info->cleanup_work_sem); + if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || + (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { + up_read(&fs_info->cleanup_work_sem); + goto out; + } + up_read(&fs_info->cleanup_work_sem); + + mutex_lock(&fs_info->cleaner_mutex); + ret = btrfs_recover_relocation(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + if (ret < 0) { + btrfs_warn(fs_info, "failed to recover relocation: %d", ret); + goto out; + } + + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "creating free space tree"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); + goto out; + } + } + + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { + ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); + if (ret) + goto out; + } + + ret = btrfs_resume_balance_async(fs_info); + if (ret) + goto out; + + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + btrfs_warn(fs_info, "failed to resume dev_replace"); + goto out; + } + + btrfs_qgroup_rescan_resume(fs_info); + + if (!fs_info->uuid_root) { + btrfs_info(fs_info, "creating UUID tree"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to create the UUID tree %d", ret); + goto out; + } + } + +out: + return ret; +} + int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { @@ -2888,7 +2988,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *chunk_root; int ret; int err = -EINVAL; - int clear_free_space_tree = 0; int level; ret = init_mount_fs_info(fs_info, sb); @@ -3035,6 +3134,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); + fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); + /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -3055,6 +3156,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* Cache block sizes */ fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csum_size = btrfs_super_csum_size(disk_super); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; /* @@ -3111,6 +3215,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), + 
BTRFS_CHUNK_TREE_OBJECTID, generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { @@ -3134,11 +3239,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Keep the devid that is marked to be the target device for the - * device replace procedure + * At this point we know all the devices that make up this filesystem, + * including the seed devices but we don't know yet if the replace + * target is required. So free devices that are not part of this + * filesystem, but skip the replace target device which is checked + * below in btrfs_init_dev_replace(). */ - btrfs_free_extra_devids(fs_devices, 0); - + btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -3185,7 +3292,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_block_groups; } - btrfs_free_extra_devids(fs_devices, 1); + ret = btrfs_check_zoned_mode(fs_info); + if (ret) { + btrfs_err(fs_info, "failed to initialize zoned mode: %d", + ret); + goto fail_block_groups; + } ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { @@ -3275,22 +3387,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_qgroup; - if (!sb_rdonly(sb)) { - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { - btrfs_warn(fs_info, "failed to recover relocation: %d", - ret); - err = -EINVAL; - goto fail_qgroup; - } - } - fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); @@ -3300,78 +3396,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - return 0; - - if (btrfs_test_opt(fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = 1; - } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { - btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = 1; - } - - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } - - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && - !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "creating free space tree"); - ret = btrfs_create_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create free space tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } - - down_read(&fs_info->cleanup_work_sem); - if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || - (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { - up_read(&fs_info->cleanup_work_sem); - close_ctree(fs_info); - return ret; - } - up_read(&fs_info->cleanup_work_sem); + goto clear_oneshot; - ret = btrfs_resume_balance_async(fs_info); - if (ret) { - btrfs_warn(fs_info, "failed to resume balance: %d", ret); - close_ctree(fs_info); - return ret; - } - - ret = btrfs_resume_dev_replace_async(fs_info); + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { - btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(fs_info); return ret; } - -
btrfs_qgroup_rescan_resume(fs_info); btrfs_discard_resume(fs_info); - if (!fs_info->uuid_root) { - btrfs_info(fs_info, "creating UUID tree"); - ret = btrfs_create_uuid_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create the UUID tree: %d", ret); - close_ctree(fs_info); - return ret; - } - } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || - fs_info->generation != - btrfs_super_uuid_tree_generation(disk_super)) { + if (fs_info->uuid_root && + (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || + fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3381,14 +3417,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device return ret; } } - set_bit(BTRFS_FS_OPEN, &fs_info->flags); - /* - * backuproot only affect mount behavior, and if open_ctree succeeded, - * no need to keep the flag - */ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + set_bit(BTRFS_FS_OPEN, &fs_info->flags); +clear_oneshot: + btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: @@ -3469,10 +3502,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, { struct btrfs_super_block *super; struct page *page; - u64 bytenr; + u64 bytenr, bytenr_orig; struct address_space *mapping = bdev->bd_inode->i_mapping; + int ret; + + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret == -ENOENT) + return ERR_PTR(-EINVAL); + else if (ret) + return ERR_PTR(ret); - bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return ERR_PTR(-EINVAL); @@ -3486,7 +3526,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_PTR(-ENODATA); } - if (btrfs_super_bytenr(super) != bytenr) { + if (btrfs_super_bytenr(super) != bytenr_orig) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -3541,7 +3581,8 @@ static int write_dev_supers(struct btrfs_device *device, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; int errors = 0; - u64 bytenr; + int ret; + u64 bytenr, bytenr_orig; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3553,12 +3594,22 @@ static int write_dev_supers(struct btrfs_device *device, struct bio *bio; struct btrfs_super_block *disk_super; - bytenr = btrfs_sb_offset(i); + bytenr_orig = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); + if (ret == -ENOENT) { + continue; + } else if (ret < 0) { + btrfs_err(device->fs_info, + "couldn't get super block location for mirror %d", + i); + errors++; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; - btrfs_set_super_bytenr(sb, bytenr); + btrfs_set_super_bytenr(sb, bytenr_orig); crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, @@ -3603,6 +3654,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); + btrfs_advance_sb_log(device, i); } return errors < i ? 
0 : -1; } @@ -3619,6 +3671,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) int i; int errors = 0; bool primary_failed = false; + int ret; u64 bytenr; if (max_mirrors == 0) @@ -3627,7 +3680,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) for (i = 0; i < max_mirrors; i++) { struct page *page; - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(device, i, READ, &bytenr); + if (ret == -ENOENT) { + break; + } else if (ret < 0) { + errors++; + if (i == 0) + primary_failed = true; + continue; + } if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; @@ -3941,14 +4002,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, } } - if (root->free_ino_pinned) - __btrfs_remove_free_space_cache(root->free_ino_pinned); - if (root->free_ino_ctl) - __btrfs_remove_free_space_cache(root->free_ino_ctl); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (drop_ref) btrfs_put_root(root); } @@ -4171,8 +4224,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); int was_dirty; @@ -4185,8 +4237,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - root = BTRFS_I(buf->pages[0]->mapping->host)->root; - fs_info = root->fs_info; btrfs_assert_tree_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", @@ -4691,3 +4741,58 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } + +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); /* Corruption */ + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + mutex_lock(&root->objectid_mutex); + + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + btrfs_warn(root->fs_info, + "the objectid of root %llu reaches its highest value", + root->root_key.objectid); + ret = -ENOSPC; + goto out; + } + + *objectid = ++root->highest_objectid; + ret = 0; +out: + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 182540bdcea0..e45057c0c016 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -43,13 +43,15 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - 
u64 parent_transid, int level, - struct btrfs_key *first_key); -void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); + u64 owner_root, u64 parent_transid, + int level, struct btrfs_key *first_key); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, - u64 bytenr); + u64 bytenr, u64 owner_root, + int level); void btrfs_clean_tree_block(struct extent_buffer *buf); +void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); +int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -79,7 +81,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror); blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, @@ -112,10 +114,10 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset, void *private_data, - extent_submit_bio_start_t *submit_bio_start); +blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -131,16 +133,15 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_init_lockdep(void); void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); #else -static inline void btrfs_init_lockdep(void) -{ } static inline void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) { diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 1a8d419d9e1f..1d4c2397d0d6 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -222,7 +222,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; if (ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = BTRFS_I(inode)->root->root_key.objectid; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9800a8306368..04083ee5ae6e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -21,10 +21,24 @@ struct io_failure_record; #define EXTENT_NORESERVE (1U << 11) #define EXTENT_QGROUP_RESERVED (1U << 12) #define EXTENT_CLEAR_DATA_RESV (1U << 13) +/* + * Must be cleared only during ordered extent completion or on error paths if we + * did 
not manage to submit bios and create the ordered extents for the range. + * Should not be cleared during page release and page invalidation (if there is + * an ordered extent in flight), that is left for the ordered extent completion. + */ #define EXTENT_DELALLOC_NEW (1U << 14) +/* + * When an ordered extent successfully completes for a region marked as a new + * delalloc range, use this flag when clearing a new delalloc range to indicate + * that the VFS' inode number of bytes should be incremented and the inode's new + * delalloc bytes decremented, in an atomic way to prevent races with stat(2). + */ +#define EXTENT_ADD_INODE_BYTES (1U << 15) #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \ + EXTENT_ADD_INODE_BYTES) /* * Redefined bits above which are used only in the device allocation tree, @@ -73,7 +87,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; refcount_t refs; - unsigned state; + u32 state; struct io_failure_record *failrec; @@ -105,19 +119,18 @@ void __cold extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned bits, int contig); + u64 max_bytes, u32 bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, - struct extent_state *cached_state); + u32 bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); + u32 bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached, gfp_t mask, struct extent_changeset *changeset); @@ -141,7 +154,7 @@ static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) + u64 end, u32 bits) { int wake = 0; @@ -152,17 +165,19 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset); + u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask); + u32 bits, unsigned exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits); + u32 bits); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits) + u64 end, u32 bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, + NULL); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -175,8 +190,8 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, static inline 
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL, + mask, NULL); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -188,16 +203,16 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, + u32 bits, u32 clear_bits, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, unsigned int extra_bits, + u64 end, u32 extra_bits, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, - NULL, cached_state, GFP_NOFS); + 0, NULL, cached_state, GFP_NOFS, NULL); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, @@ -205,30 +220,30 @@ static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, GFP_NOFS); + 0, NULL, cached_state, GFP_NOFS, NULL); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, - GFP_NOFS); + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL, + GFP_NOFS, NULL); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + cached_state, mask, NULL); } int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, + u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits); + u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits); + u64 *start_ret, u64 *end_ret, u32 bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5fd60b13f4f8..56ea380f5a17 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1465,7 +1465,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, parent, root_objectid, owner, @@ -1489,7 +1488,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->leave_spinning = 1; /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { BUG_ON(refs_to_add != 1); @@ -1605,7 +1603,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->leave_spinning = 1; ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); if (ret < 0) { err = ret; @@ -2133,25 +2130,6 @@ 
static u64 find_middle(struct rb_root *root) #endif /* - * Takes the number of bytes to be csumm'ed and figures out how many leaves it - * would require to store the csums for that many bytes. - */ -u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - u64 csum_size; - u64 num_csums_per_leaf; - u64 num_csums; - - csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); - num_csums_per_leaf = div64_u64(csum_size, - (u64)btrfs_super_csum_size(fs_info->super_copy)); - num_csums = div64_u64(csum_bytes, fs_info->sectorsize); - num_csums += num_csums_per_leaf - 1; - num_csums = div64_u64(num_csums, num_csums_per_leaf); - return num_csums; -} - -/* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be * 0, which means to process everything in the tree at the start @@ -2663,6 +2641,11 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, BUG_ON(!btrfs_block_group_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); } else { + /* + * We must wait for v1 caching to finish, otherwise we may not + * remove our space. + */ + btrfs_wait_space_cache_v1_finished(block_group, caching_ctl); mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { @@ -2730,31 +2713,6 @@ btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) atomic_inc(&bg->reservations); } -void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) -{ - struct btrfs_caching_control *next; - struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group *cache; - - down_write(&fs_info->commit_root_sem); - - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - cache = caching_ctl->block_group; - if (btrfs_block_group_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - btrfs_put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - - up_write(&fs_info->commit_root_sem); - - btrfs_update_global_block_rsv(fs_info); -} - /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. 
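The btrfs_csum_bytes_to_leaves() helper removed above redid the per-leaf csum capacity math on every call; with csum_size, csums_per_leaf and sectorsize_bits now cached in fs_info by open_ctree() (see the earlier disk-io.c hunk), the same computation reduces to one shift and one division. A hedged sketch of the equivalent math, under the assumption that the cached fields are filled in; csum_bytes_to_leaves is a hypothetical name, not the patch's actual replacement code:

        static u64 csum_bytes_to_leaves(const struct btrfs_fs_info *fs_info,
                                        u64 csum_bytes)
        {
                /* One checksum per sector; the shift replaces a 64-bit division. */
                const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;

                /* Round up: a final partially filled leaf still costs a leaf. */
                return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
        }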
@@ -2816,11 +2774,13 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len = cache->start + cache->length - start; len = min(len, end + 1 - start); - if (start < cache->last_byte_to_unpin) { - len = min(len, cache->last_byte_to_unpin - start); - if (return_free_space) - btrfs_add_free_space(cache, start, len); + down_read(&fs_info->commit_root_sem); + if (start < cache->last_byte_to_unpin && return_free_space) { + u64 add_len = min(len, cache->last_byte_to_unpin - start); + + btrfs_add_free_space(cache, start, add_len); } + up_read(&fs_info->commit_root_sem); start += len; total_unpinned += len; @@ -3040,8 +3000,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; if (!is_data && refs_to_drop != 1) { @@ -3106,7 +3064,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); - path->leave_spinning = 1; /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; @@ -4448,7 +4405,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { @@ -4533,7 +4489,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, &extent_key, size); if (ret) { @@ -4662,7 +4617,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; @@ -4679,12 +4634,16 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } + /* + * This needs to stay, because we could allocate a freed block from an + * old tree into a new tree, so we need to make sure this new block is + * set to the appropriate level and owner. 
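+ *
+ * For instance (illustrative values only): a node freed from the extent
+ * tree at level 1 and reallocated as a leaf of a subvolume tree must end
+ * up keyed as btrfs_set_buffer_lockdep_class(BTRFS_FS_TREE_OBJECTID,
+ * buf, 0) by the call below.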
+ */ btrfs_set_buffer_lockdep_class(owner, buf, level); __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - btrfs_set_lock_blocking_write(buf); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); @@ -4905,7 +4864,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(fs_info, bytenr); + btrfs_readahead_node_child(eb, slot); nread++; } wc->reada_slot = slot; @@ -5064,16 +5023,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = find_extent_buffer(fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + root->root_key.objectid, level - 1); if (IS_ERR(next)) return PTR_ERR(next); - - btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, - level - 1); reada = 1; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], @@ -5124,8 +5080,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation, level - 1, - &first_key); + next = read_tree_block(fs_info, bytenr, root->root_key.objectid, + generation, level - 1, &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -5133,7 +5089,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return -EIO; } btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); } level--; @@ -5145,7 +5100,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } path->nodes[level] = next; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; @@ -5273,8 +5228,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level]) { BUG_ON(level == 0); btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, @@ -5317,8 +5271,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (!path->locks[level] && btrfs_header_generation(eb) == trans->transid) { btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clean_tree_block(eb); } @@ -5486,9 +5439,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(path->nodes[level]); path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { @@ -5496,7 +5448,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) memcpy(&wc->update_progress, &key, sizeof(wc->update_progress)); - level = root_item->drop_level; + level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ 
-5516,8 +5468,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking_write(path->nodes[level]); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, @@ -5529,7 +5480,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } BUG_ON(wc->refs[level] == 0); - if (level == root_item->drop_level) + if (level == btrfs_root_drop_level(root_item)) break; btrfs_tree_unlock(path->nodes[level]); @@ -5574,7 +5525,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress); - root_item->drop_level = wc->drop_level; + btrfs_set_root_drop_level(root_item, wc->drop_level); BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || @@ -5704,7 +5655,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; + path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60f5f68d892d..6e3b72e63e42 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -142,7 +142,7 @@ struct extent_page_data { unsigned int sync_io:1; }; -static int add_extent_changeset(struct extent_state *state, unsigned bits, +static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, int set) { @@ -530,7 +530,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits, + struct extent_state *state, u32 *bits, struct extent_changeset *changeset); /* @@ -547,7 +547,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -628,11 +628,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake, + u32 *bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -695,9 +695,9 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. 
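 *
 * An illustrative call, not part of this patch: releasing a locked range
 * while waking waiters and keeping the cached state would be
 *
 *	__clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0,
 *			   &cached, GFP_NOFS, NULL);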
*/ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + u32 bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -868,7 +868,7 @@ static void wait_on_state(struct extent_io_tree *tree, * The tree lock is taken by this function */ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits) + u32 bits) { struct extent_state *state; struct rb_node *node; @@ -915,9 +915,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, struct extent_changeset *changeset) + u32 *bits, struct extent_changeset *changeset) { - unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -961,12 +961,10 @@ static void cache_state(struct extent_state *state, * * [start, end] is inclusive This takes the tree lock. */ - -static int __must_check -__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned exclusive_bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + u32 exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -980,6 +978,10 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); + if (exclusive_bits) + ASSERT(failed_start); + else + ASSERT(failed_start == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1179,15 +1181,6 @@ out: } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, u64 * failed_start, - struct extent_state **cached_state, gfp_t mask) -{ - return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask, NULL); -} - - /** * convert_extent_bit - convert all bits in a given range from one bit to * another @@ -1207,7 +1200,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * All allocations are done with GFP_NOFS. 
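 *
 * Illustrative only, with bit names from extent-io-tree.h above: atomically
 * setting EXTENT_NEW while clearing EXTENT_DIRTY over a range would be
 *
 *	convert_extent_bit(tree, start, end, EXTENT_NEW, EXTENT_DIRTY,
 *			   &cached_state);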
*/ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, unsigned clear_bits, + u32 bits, u32 clear_bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1408,7 +1401,7 @@ out: /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1418,19 +1411,19 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, - changeset); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, + changeset); } int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits) + u32 bits) { - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, - GFP_NOWAIT, NULL); + return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, + GFP_NOWAIT, NULL); } int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, + u32 bits, int wake, int delete, struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, @@ -1438,7 +1431,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1461,9 +1454,9 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, + EXTENT_LOCKED, &failed_start, + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1479,8 +1472,8 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS, NULL); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1526,8 +1519,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) * nothing was found after 'start' */ static struct extent_state * -find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned bits) +find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) { struct rb_node *node; struct extent_state *state; @@ -1554,14 +1546,15 @@ out: } /* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. + * Find the first offset in the io tree with one or more @bits set. * - * If nothing was found, 1 is returned. If found something, return 0. + * Note: If there are multiple bits set in @bits, any of them will match. + * + * Return 0 if we find something, and update @start_ret and @end_ret. + * Return 1 if we found nothing. 
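+ *
+ * Illustrative call, not part of the patch:
+ *
+ *	ret = find_first_extent_bit(tree, 0, &start, &end,
+ *				    EXTENT_DIRTY | EXTENT_NEW, NULL);
+ *
+ * ret == 0 means [start, end] is the first range with either bit set.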
*/ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits, + u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1612,7 +1605,7 @@ out: * returned will be the full contiguous area with the bits set. */ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; int ret = 1; @@ -1649,7 +1642,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, * trim @end_ret to the appropriate size. */ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned bits) + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct rb_node *node, *prev = NULL, *next; @@ -1946,7 +1939,7 @@ static int __process_pages_contig(struct address_space *mapping, unsigned long page_ops, pgoff_t *index_ret) { unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_locked = 0; + unsigned long pages_processed = 0; pgoff_t index = start_index; struct page *pages[16]; unsigned ret; @@ -1981,7 +1974,7 @@ static int __process_pages_contig(struct address_space *mapping, if (locked_page && pages[i] == locked_page) { put_page(pages[i]); - pages_locked++; + pages_processed++; continue; } if (page_ops & PAGE_CLEAR_DIRTY) @@ -2006,7 +1999,7 @@ static int __process_pages_contig(struct address_space *mapping, } } put_page(pages[i]); - pages_locked++; + pages_processed++; } nr_pages -= ret; index += ret; @@ -2014,14 +2007,13 @@ static int __process_pages_contig(struct address_space *mapping, } out: if (err && index_ret) - *index_ret = start_index + pages_locked - 1; + *index_ret = start_index + pages_processed - 1; return err; } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned clear_bits, - unsigned long page_ops) + u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); @@ -2037,7 +2029,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned bits, int contig) + u32 bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -2157,7 +2149,7 @@ out: * range is found set. 
*/ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int filled, struct extent_state *cached) + u32 bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2642,7 +2634,7 @@ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) } blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u64 phy_offset, + struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, u64 start, u64 end, int failed_mirror, submit_bio_hook_t *submit_bio_hook) @@ -2652,7 +2644,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); - const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits; + const int icsum = bio_offset >> fs_info->sectorsize_bits; bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; @@ -2685,7 +2677,7 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_bio->bi_private = failed_bio->bi_private; if (failed_io_bio->csum) { - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; repair_io_bio->csum = repair_io_bio->csum_inline; memcpy(repair_io_bio->csum, @@ -2775,16 +2767,88 @@ static void end_bio_extent_writepage(struct bio *bio) bio_put(bio); } -static void -endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, - int uptodate) +/* + * Record previously processed extent range + * + * For endio_readpage_release_extent() to handle a full extent range, reducing + * the extent io operations. + */ +struct processed_extent { + struct btrfs_inode *inode; + /* Start of the range in @inode */ + u64 start; + /* End of the range in @inode */ + u64 end; + bool uptodate; +}; + +/* + * Try to release the processed extent range + * + * May not release the extent range right now if the current range is + * contiguous to the processed extent. + * + * Will release the processed extent when @inode or @uptodate changes, or when + * the range is no longer contiguous to the processed range. + * + * Passing @inode == NULL will force the processed extent to be released. + */ +static void endio_readpage_release_extent(struct processed_extent *processed, + struct btrfs_inode *inode, u64 start, u64 end, + bool uptodate) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + struct extent_io_tree *tree; + + /* The first extent, initialize @processed */ + if (!processed->inode) + goto update; - if (uptodate && tree->track_uptodate) - set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(tree, start, end, &cached); + /* + * Contiguous to the processed extent, just update the end. + * + * Several things to notice: + * + * - bio can be merged as long as the on-disk bytenr is contiguous + * This means we can have pages belonging to other inodes, thus we need + * to check if the inode still matches. + * - bvec can contain a range beyond the current page for a multi-page bvec + * Thus we need the processed->end + 1 >= start check + */ + if (processed->inode == inode && processed->uptodate == uptodate && + processed->end + 1 >= start && end >= processed->end) { + processed->end = end; + return; + } + + tree = &processed->inode->io_tree; + /* + * Now we don't have a range contiguous to the processed range, release + * the processed range now.
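+ *
+ * e.g. (illustrative): [0, 4095] followed by [4096, 8191] from the same
+ * inode with the same uptodate status is accumulated and later released
+ * as a single [0, 8191] unlock, while a range from another inode flushes
+ * the accumulated one immediately.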
+ */ + if (processed->uptodate && tree->track_uptodate) + set_extent_uptodate(tree, processed->start, processed->end, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(tree, processed->start, processed->end, + &cached); + +update: + /* Update processed to current range */ + processed->inode = inode; + processed->start = start; + processed->end = end; + processed->uptodate = uptodate; +} + +static void endio_readpage_update_page_status(struct page *page, bool uptodate) +{ + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); } /* @@ -2804,12 +2868,12 @@ static void end_bio_extent_readpage(struct bio *bio) int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; - u64 offset = 0; - u64 start; - u64 end; - u64 len; - u64 extent_start = 0; - u64 extent_len = 0; + struct processed_extent processed = { 0 }; + /* + * The offset to the beginning of a bio, since one bio can never be + * larger than UINT_MAX, u32 here is enough. + */ + u32 bio_offset = 0; int mirror; int ret; struct bvec_iter_all iter_all; @@ -2819,42 +2883,48 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + u64 start; + u64 end; + u32 len; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_status, + bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } - - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + /* + * We always issue full-sector reads, but if some block in a + * page fails to read, blk_update_request() will advance + * bv_offset and adjust bv_len to compensate. Print a warning + * for unaligned offsets, and an error if they don't add up to + * a full sector. 
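+ * + * For example (an editor's illustration): with a 4K sectorsize, a bvec with + * bv_offset == 8192 and bv_len == 4096 passes both checks, while a bvec with + * bv_offset == 100 triggers the "partial page read" error below.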
+ */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page read in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, + sectorsize)) + btrfs_info(fs_info, + "incomplete page read with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; len = bvec->bv_len; mirror = io_bio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, offset, page, - start, end, mirror); + ret = btrfs_verify_data_csum(io_bio, + bio_offset, page, start, end, + mirror); else ret = btrfs_validate_metadata_buffer(io_bio, - offset, page, start, end, mirror); + page, start, end, mirror); if (ret) uptodate = 0; else @@ -2879,12 +2949,14 @@ static void end_bio_extent_readpage(struct bio *bio) * If it can't handle the error it will return -EIO and * we remain responsible for that page. */ - if (!btrfs_submit_read_repair(inode, bio, offset, page, + if (!btrfs_submit_read_repair(inode, bio, bio_offset, + page, start - page_offset(page), start, end, mirror, btrfs_submit_data_bio)) { uptodate = !bio->bi_status; - offset += len; + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; continue; } } else { @@ -2908,40 +2980,17 @@ readpage_ok: off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - offset += len; - - if (unlikely(!uptodate)) { - if (extent_len) { - endio_readpage_release_extent(tree, - extent_start, - extent_len, 1); - extent_start = 0; - extent_len = 0; - } - endio_readpage_release_extent(tree, start, - end - start + 1, 0); - } else if (!extent_len) { - extent_start = start; - extent_len = end + 1 - start; - } else if (extent_start + extent_len == start) { - extent_len += end + 1 - start; - } else { - endio_readpage_release_extent(tree, extent_start, - extent_len, uptodate); - extent_start = start; - extent_len = end + 1 - start; } - } + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; - if (extent_len) - endio_readpage_release_extent(tree, extent_start, extent_len, - uptodate); + /* Update page status and unlock */ + endio_readpage_update_page_status(page, uptodate); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, uptodate); + } + /* Release the last extent */ + endio_readpage_release_extent(&processed, NULL, 0, 0, false); btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -3038,7 +3087,7 @@ static int submit_extent_page(unsigned int opf, { int ret = 0; struct bio *bio; - size_t page_size = min_t(size_t, size, PAGE_SIZE); + size_t io_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; @@ -3054,12 +3103,12 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) + if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || force_bio_submit || - bio_add_page(bio, page, page_size, pg_offset) < page_size) { + bio_add_page(bio, page, io_size, pg_offset) < io_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -3068,13 +3117,13 @@ static int 
submit_extent_page(unsigned int opf, bio = NULL; } else { if (wbc) - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, io_size); return 0; } } bio = btrfs_bio_alloc(offset); - bio_add_page(bio, page, page_size, pg_offset); + bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -3085,7 +3134,7 @@ static int submit_extent_page(unsigned int opf, bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, io_size); } *bio_ret = bio; @@ -3096,6 +3145,15 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { + /* + * If the page is mapped to the btree inode, we should hold the private + * lock to prevent races. + * For cloned or dummy extent buffers, their pages are not mapped and + * will not race with any other ebs. + */ + if (page->mapping) + lockdep_assert_held(&page->mapping->private_lock); + if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3158,7 +3216,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int nr = 0; size_t pg_offset = 0; size_t iosize; - size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -3224,13 +3281,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) { - disk_io_size = em->block_len; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) offset = em->block_start; - } else { + else offset = em->block_start + extent_offset; - disk_io_size = iosize; - } block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3319,7 +3373,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, disk_io_size, + page, offset, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3656,11 +3710,14 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) } /* - * Lock eb pages and flush the bio if we can't the locks + * Lock extent buffer status and pages for writeback. + * + * May try to flush the write bio if we can't get the lock. * - * Return 0 if nothing went wrong - * Return >0 is same as 0, except bio is not submitted - * Return <0 if something went wrong, no page is locked + * Return 0 if the extent buffer doesn't need to be submitted. + * (E.g. the extent buffer is not dirty) + * Return >0 if the extent buffer is submitted to a bio. + * Return <0 if something went wrong, no page is locked. */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct extent_page_data *epd) @@ -3930,10 +3987,81 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, return ret; } +/* + * Submit all page(s) of one extent buffer.
+ * + * @page: the page of one extent buffer + * @eb_context: used to determine if we need to submit this page; if the + * current page belongs to this eb, we don't need to submit it + * + * The caller should pass each page in bytenr order, and here we use + * @eb_context to determine if we have submitted pages of one extent buffer. + * + * If we have, we just skip until we hit a new page that doesn't belong to + * current @eb_context. + * + * If not, we submit all the page(s) of the extent buffer. + * + * Return >0 if we have submitted the extent buffer successfully. + * Return 0 if we don't need to submit the page, as it's already submitted by + * a previous call. + * Return <0 for fatal error. + */ +static int submit_eb_page(struct page *page, struct writeback_control *wbc, + struct extent_page_data *epd, + struct extent_buffer **eb_context) +{ + struct address_space *mapping = page->mapping; + struct extent_buffer *eb; + int ret; + + if (!PagePrivate(page)) + return 0; + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + eb = (struct extent_buffer *)page->private; + + /* + * Shouldn't happen and normally this would be a BUG_ON but no point + * crashing the machine for something we can survive anyway. + */ + if (WARN_ON(!eb)) { + spin_unlock(&mapping->private_lock); + return 0; + } + + if (eb == *eb_context) { + spin_unlock(&mapping->private_lock); + return 0; + } + ret = atomic_inc_not_zero(&eb->refs); + spin_unlock(&mapping->private_lock); + if (!ret) + return 0; + + *eb_context = eb; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret <= 0) { + free_extent_buffer(eb); + return ret; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + return ret; + return 1; +} + int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb, *prev_eb = NULL; + struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { .bio = NULL, .extent_locked = 0, @@ -3979,55 +4107,13 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (!PagePrivate(page)) - continue; - - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + ret = submit_eb_page(page, wbc, &epd, &eb_context); + if (ret == 0) continue; - } - - eb = (struct extent_buffer *)page->private; - - /* - * Shouldn't happen and normally this would be a BUG_ON - * but no sense in crashing the users box for something - * we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); - continue; - } - - if (eb == prev_eb) { - spin_unlock(&mapping->private_lock); - continue; - } - - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); - if (!ret) - continue; - - prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, &epd); - if (!ret) { - free_extent_buffer(eb); - continue; - } else if (ret < 0) { - done = 1; - free_extent_buffer(eb); - break; - } - - ret = write_one_eb(eb, wbc, &epd); - if (ret) { + if (ret < 0) { done = 1; - free_extent_buffer(eb); break; } - free_extent_buffer(eb); /* * the filesystem may choose to bump up nr_to_write.
@@ -4048,7 +4134,6 @@ retry: index = 0; goto retry; } - ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); return ret; @@ -4382,14 +4467,22 @@ int extent_invalidatepage(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; + /* This function is only called for the btree inode */ + ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); + start += ALIGN(offset, blocksize); if (start > end) return 0; lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); + + /* + * Currently for btree io tree, only EXTENT_LOCKED is utilized, + * so here we only need to unlock the extent range to free any + * existing extent state. + */ + unlock_extent_cached(tree, start, end, &cached_state); return 0; } @@ -4409,12 +4502,14 @@ static int try_release_extent_state(struct extent_io_tree *tree, ret = 0; } else { /* - * at this point we can safely clear everything except the - * locked bit and the nodatasum bit + * At this point we can safely clear everything except the + * locked bit, the nodatasum bit and the delalloc new bit. + * The delalloc new bit will be cleared by ordered extent + * completion. */ ret = __clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask, NULL); + ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -4691,7 +4786,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); @@ -4950,12 +5044,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->len = len; eb->fs_info = fs_info; eb->bflags = 0; - rwlock_init(&eb->lock); - atomic_set(&eb->blocking_readers, 0); - eb->blocking_writers = 0; - eb->lock_recursed = false; - init_waitqueue_head(&eb->write_lock_wq); - init_waitqueue_head(&eb->read_lock_wq); + init_rwsem(&eb->lock); btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); @@ -4964,19 +5053,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - /* - * Sanity checks, currently the maximum is 64k covered by 16x 4k pages - */ - BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE - > MAX_INLINE_EXTENT_BUFFER_SIZE); - BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); - -#ifdef CONFIG_BTRFS_DEBUG - eb->spinning_writers = 0; - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->read_locks, 0); - eb->write_locks = 0; -#endif + ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } @@ -5105,7 +5182,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, rcu_read_lock(); eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> PAGE_SHIFT); + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); /* @@ -5157,7 +5234,7 @@ again: } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5178,7 +5255,7 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - 
u64 start) + u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; int num_pages; @@ -5196,6 +5273,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } + if (fs_info->sectorsize < PAGE_SIZE && + offset_in_page(start) + len > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %lu", + start, len); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -5203,6 +5288,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return ERR_PTR(-ENOMEM); + btrfs_set_buffer_lockdep_class(owner_root, eb, level); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { @@ -5231,13 +5317,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } exists = NULL; - /* - * Do this so attach doesn't complain and we need to - * drop the ref the old guy had. - */ - ClearPagePrivate(p); WARN_ON(PageDirty(p)); - put_page(p); + detach_page_private(p); } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); @@ -5265,7 +5346,7 @@ again: spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, - start >> PAGE_SHIFT, eb); + start >> fs_info->sectorsize_bits, eb); spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { @@ -5321,7 +5402,7 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_lock(&fs_info->buffer_lock); radix_tree_delete(&fs_info->buffer_radix, - eb->start >> PAGE_SHIFT); + eb->start >> fs_info->sectorsize_bits); spin_unlock(&fs_info->buffer_lock); } else { spin_unlock(&eb->refs_lock); @@ -5622,12 +5703,12 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5652,13 +5733,13 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5687,13 +5768,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5719,7 +5800,7 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); } @@ -5729,7 +5810,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; WARN_ON(!PageUptodate(eb->pages[0])); - kaddr = 
page_address(eb->pages[0]); + kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); } @@ -5742,12 +5823,12 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5771,12 +5852,12 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - unsigned long i = start >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(start); if (check_eb_range(eb, start, len)) return; - offset = offset_in_page(start); + offset = get_eb_offset_in_page(eb, start); while (len > 0) { page = eb->pages[i]; @@ -5800,10 +5881,20 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); + if (dst->fs_info->sectorsize == PAGE_SIZE) { + num_pages = num_extent_pages(dst); + for (i = 0; i < num_pages; i++) + copy_page(page_address(dst->pages[i]), + page_address(src->pages[i])); + } else { + size_t src_offset = get_eb_offset_in_page(src, 0); + size_t dst_offset = get_eb_offset_in_page(dst, 0); + + ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + memcpy(page_address(dst->pages[0]) + dst_offset, + page_address(src->pages[0]) + src_offset, + src->len); + } } void copy_extent_buffer(const struct extent_buffer *dst, @@ -5816,7 +5907,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, size_t offset; struct page *page; char *kaddr; - unsigned long i = dst_offset >> PAGE_SHIFT; + unsigned long i = get_eb_page_index(dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -5824,7 +5915,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = offset_in_page(dst_offset); + offset = get_eb_offset_in_page(dst, dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5868,7 +5959,7 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. 
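* For subpage metadata, an editor's illustration: a 16K nodesize extent buffer * that starts 16K into a 64K page has offset_in_page(eb->start) == 16K, so * byte 0 of the bitmap lands 16K into page 0 rather than at the page start.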
*/ - offset = start + byte_offset; + offset = start + offset_in_page(eb->start) + byte_offset; *page_index = offset >> PAGE_SHIFT; *page_offset = offset_in_page(offset); @@ -6022,11 +6113,11 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, return; while (len > 0) { - dst_off_in_page = offset_in_page(dst_offset); - src_off_in_page = offset_in_page(src_offset); + dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); + src_off_in_page = get_eb_offset_in_page(dst, src_offset); - dst_i = dst_offset >> PAGE_SHIFT; - src_i = src_offset >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_offset); + src_i = get_eb_page_index(src_offset); cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); @@ -6062,11 +6153,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } while (len > 0) { - dst_i = dst_end >> PAGE_SHIFT; - src_i = src_end >> PAGE_SHIFT; + dst_i = get_eb_page_index(dst_end); + src_i = get_eb_page_index(src_end); - dst_off_in_page = offset_in_page(dst_end); - src_off_in_page = offset_in_page(src_end); + dst_off_in_page = get_eb_offset_in_page(dst, dst_end); + src_off_in_page = get_eb_offset_in_page(dst, src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); @@ -6121,3 +6212,54 @@ int try_release_extent_buffer(struct page *page) return release_extent_buffer(eb); } + +/* + * btrfs_readahead_tree_block - attempt to readahead a child block + * @fs_info: the fs_info + * @bytenr: bytenr to read + * @owner_root: objectid of the root that owns this eb + * @gen: generation for the uptodate check, can be 0 + * @level: level for the eb + * + * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a + * normal uptodate check of the eb, without checking the generation. If we have + * to read the block we will not block on anything. + */ +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, u64 gen, int level) +{ + struct extent_buffer *eb; + int ret; + + eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + if (IS_ERR(eb)) + return; + + if (btrfs_buffer_uptodate(eb, gen, 1)) { + free_extent_buffer(eb); + return; + } + + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(eb); + else + free_extent_buffer(eb); +} + +/* + * btrfs_readahead_node_child - readahead a node's child block + * @node: parent node we're reading from + * @slot: slot in the parent node for the child we want to read + * + * A helper for btrfs_readahead_tree_block; we simply read the bytenr pointed + * to by @slot in the node provided.
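+ * + * A typical caller-side pattern might look like the following (editor's + * sketch, not taken from this patch): + * + * for (slot = 0; slot < btrfs_header_nritems(node); slot++) + * btrfs_readahead_node_child(node, slot);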
+ */ +void btrfs_readahead_node_child(struct extent_buffer *node, int slot) +{ + btrfs_readahead_tree_block(node->fs_info, + btrfs_node_blockptr(node, slot), + btrfs_header_owner(node), + btrfs_node_ptr_generation(node, slot), + btrfs_header_level(node) - 1); +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f39d02e7f7ef..19221095c635 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -6,6 +6,7 @@ #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/fiemap.h> +#include <linux/btrfs_tree.h> #include "ulist.h" /* @@ -71,11 +72,10 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, - struct bio *bio, u64 bio_offset); +typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, + struct bio *bio, u64 dio_file_offset); -#define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) +#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; unsigned long len; @@ -87,31 +87,13 @@ struct extent_buffer { int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; - - int blocking_writers; - atomic_t blocking_readers; - bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ - short log_index; - - /* protects write locks */ - rwlock_t lock; + s8 log_index; - /* readers use lock_wq while they wait for the write - * lock holders to unlock - */ - wait_queue_head_t write_lock_wq; + struct rw_semaphore lock; - /* writers use read_lock_wq while they wait for readers - * to unlock - */ - wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - int spinning_writers; - atomic_t spinning_readers; - atomic_t read_locks; - int write_locks; struct list_head leak_list; #endif }; @@ -199,7 +181,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u64 owner_root, int level); struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, @@ -215,11 +197,20 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); +void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 owner_root, u64 gen, int level); +void btrfs_readahead_node_child(struct extent_buffer *node, int slot); static inline int num_extent_pages(const struct extent_buffer *eb) { - return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) - - (eb->start >> PAGE_SHIFT); + /* + * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to + * sectorsize, it's just eb->len >> PAGE_SHIFT. + * + * For sectorsize < PAGE_SIZE case, we could have nodesize < PAGE_SIZE, + * thus have to ensure we get at least one page. 
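+ * + * For example (an editor's illustration): a 16K nodesize eb with 64K pages + * gives eb->len >> PAGE_SHIFT == 0, so we return 1, while the same eb with + * 4K pages returns 4.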
+ */ + return (eb->len >> PAGE_SHIFT) ?: 1; } static inline int extent_buffer_uptodate(const struct extent_buffer *eb) @@ -270,8 +261,7 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned bits_to_clear, - unsigned long page_ops); + u32 bits_to_clear, unsigned long page_ops); struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); @@ -307,7 +297,7 @@ struct io_failure_record { blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u64 phy_offset, + struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, u64 start, u64 end, int failed_mirror, submit_bio_hook_t *submit_bio_hook); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8f4f2bd6d9b9..1545c22ef280 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -38,27 +38,27 @@ * Finally new_i_size should only be set in the case of truncate where we're not * ready to use i_size_read() as the limiter yet. */ -void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size) +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start, end, i_size; int ret; - i_size = new_i_size ?: i_size_read(inode); + i_size = new_i_size ?: i_size_read(&inode->vfs_inode); if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - BTRFS_I(inode)->disk_i_size = i_size; + inode->disk_i_size = i_size; return; } - spin_lock(&BTRFS_I(inode)->lock); - ret = find_contiguous_extent_bit(&BTRFS_I(inode)->file_extent_tree, 0, - &start, &end, EXTENT_DIRTY); + spin_lock(&inode->lock); + ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); else i_size = 0; - BTRFS_I(inode)->disk_i_size = i_size; - spin_unlock(&BTRFS_I(inode)->lock); + inode->disk_i_size = i_size; + spin_unlock(&inode->lock); } /** @@ -142,7 +142,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) @@ -181,7 +180,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -201,7 +200,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + fs_info->sectorsize_bits; csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; @@ -239,12 +238,117 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +/* + * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and + * store the result in @dst. + * + * Return >0 for the number of sectors we found. + * Return 0 if the range [disk_bytenr, disk_bytenr + sectorsize) has no csum + * for it.
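+ * (For example, the range may be a hole, a preallocated extent, or belong to + * a NODATASUM inode - an editor's note, based on the surrounding code.)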
The caller may want to try the next sector until one range is hit. + * Return <0 for fatal error. + */ +static int search_csum_tree(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 disk_bytenr, + u64 len, u8 *dst) +{ + struct btrfs_csum_item *item = NULL; + struct btrfs_key key; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 itemsize; + int ret; + u64 csum_start; + u64 csum_len; + + ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) && + IS_ALIGNED(len, sectorsize)); + + /* Check if the current csum item covers disk_bytenr */ + if (path->nodes[0]) { + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + + if (in_range(disk_bytenr, csum_start, csum_len)) + goto found; + } + + /* Current item doesn't contain the desired range, search again */ + btrfs_release_path(path); + item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + itemsize = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + + csum_start = key.offset; + csum_len = (itemsize / csum_size) * sectorsize; + ASSERT(in_range(disk_bytenr, csum_start, csum_len)); + +found: + ret = (min(csum_start + csum_len, disk_bytenr + len) - + disk_bytenr) >> fs_info->sectorsize_bits; + read_extent_buffer(path->nodes[0], dst, (unsigned long)item, + ret * csum_size); +out: + if (ret == -ENOENT) + ret = 0; + return ret; +} + +/* + * Locate the file_offset corresponding to @cur_disk_bytenr of a @bio. + * + * A btrfs bio represents the read range + * [bi_sector << 9, bi_sector << 9 + bi_size). + * Knowing this, we can iterate through each bvec to locate the page belonging + * to @cur_disk_bytenr and get the file offset. + * + * @inode is used to determine if the bvec page really belongs to @inode. + * + * Return 0 if we can't find the file offset. + * Return >0 if we find the file offset and store it in @file_offset_ret. + */ +static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, + u64 disk_bytenr, u64 *file_offset_ret) +{ + struct bvec_iter iter; + struct bio_vec bvec; + u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; + int ret = 0; + + bio_for_each_segment(bvec, bio, iter) { + struct page *page = bvec.bv_page; + + if (cur > disk_bytenr) + break; + if (cur + bvec.bv_len <= disk_bytenr) { + cur += bvec.bv_len; + continue; + } + ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); + if (page->mapping && page->mapping->host && + page->mapping->host == inode) { + ret = 1; + *file_offset_ret = page_offset(page) + bvec.bv_offset + + disk_bytenr - cur; + break; + } + } + return ret; +} + /** - * btrfs_lookup_bio_sums - Look up checksums for a bio. + * Look up the checksums for a read bio in the csum tree. + * * @inode: inode that the bio is for. * @bio: bio to look up. - * @offset: Unless (u64)-1, look up checksums for this offset in the file. - * If (u64)-1, use the page offsets from the bio instead. * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize).
If * NULL, the checksum buffer is allocated and returned in @@ -252,31 +356,40 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 offset, u8 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; - const bool page_offsets = (offset == (u64)-1); + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_bytenr; u8 *csum; - u64 item_start_offset = 0; - u64 item_last_offset = 0; - u64 disk_bytenr; - u64 page_bytes_left; - u32 diff; - int nblocks; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + return BLK_STS_OK; + + /* + * This function is only called for read bios. + * + * This means two things: + * - All our csums should only be in the csum tree + * No ordered extent csums, as ordered extents are only for the write + * path. + * - No need to bother with any other info from the bvecs + * Since we're looking up csums, the only important info is the + * disk_bytenr and the length, which can be extracted from bi_iter + * directly. + */ + ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; - nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); @@ -295,7 +408,11 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum = dst; } - if (bio->bi_iter.bi_size > PAGE_SIZE * 8) + /* + * If the requested number of sectors is larger than one leaf can + * contain, kick readahead for the csum tree. + */ + if (nblocks > fs_info->csums_per_leaf) path->reada = READA_FORWARD; /* @@ -309,85 +426,62 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path->skip_locking = 1; } - disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; + for (cur_disk_bytenr = orig_disk_bytenr; + cur_disk_bytenr < orig_disk_bytenr + orig_len; + cur_disk_bytenr += (count * sectorsize)) { + u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; + unsigned int sector_offset; + u8 *csum_dst; - bio_for_each_segment(bvec, bio, iter) { - page_bytes_left = bvec.bv_len; - if (count) - goto next; - - if (page_offsets) - offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, - disk_bytenr, csum, nblocks); - if (count) - goto found; + /* + * Although both cur_disk_bytenr and orig_disk_bytenr are u64, + * we're calculating the offset to the bio start. + * + * Bio size is limited to UINT_MAX, thus unsigned int is large + * enough to contain the raw result, not to mention the right + * shifted result.
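+ * + * For example (an editor's illustration, 4K sectors): cur_disk_bytenr == + * 1M + 8K and orig_disk_bytenr == 1M give sector_offset == 2, so csum_dst + * points at the third csum slot in @csum.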
+ */ + ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); + sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> + fs_info->sectorsize_bits; + csum_dst = csum + sector_offset * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, + search_len, csum_dst); + if (count <= 0) { + /* + * Either we hit a critical error or we didn't find + * the csum. + * Either way, we put zero into the csums dst, and skip + * to the next sector. + */ + memset(csum_dst, 0, csum_size); + count = 1; - if (!item || disk_bytenr < item_start_offset || - disk_bytenr >= item_last_offset) { - struct btrfs_key found_key; - u32 item_size; - - if (item) - btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, fs_info->csum_root, - path, disk_bytenr, 0); - if (IS_ERR(item)) { - count = 1; - memset(csum, 0, csum_size); - if (BTRFS_I(inode)->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - set_extent_bits(io_tree, offset, - offset + fs_info->sectorsize - 1, + /* + * For data reloc inode, we need to mark the range + * NODATASUM so that balance won't report false csum + * error. + */ + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + u64 file_offset; + int ret; + + ret = search_file_offset_in_bio(bio, inode, + cur_disk_bytenr, &file_offset); + if (ret) + set_extent_bits(io_tree, file_offset, + file_offset + sectorsize - 1, EXTENT_NODATASUM); - } else { - btrfs_info_rl(fs_info, - "no csum found for inode %llu start %llu", - btrfs_ino(BTRFS_I(inode)), offset); - } - item = NULL; - btrfs_release_path(path); - goto found; + } else { + btrfs_warn_rl(fs_info, + "csum hole found for disk bytenr range [%llu, %llu)", + cur_disk_bytenr, cur_disk_bytenr + sectorsize); } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - - item_start_offset = found_key.offset; - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - item_last_offset = item_start_offset + - (item_size / csum_size) * - fs_info->sectorsize; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_csum_item); - } - /* - * this byte range must be able to fit inside - * a single leaf so it will also fit inside a u32 - */ - diff = disk_bytenr - item_start_offset; - diff = diff / fs_info->sectorsize; - diff = diff * csum_size; - count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> - inode->i_sb->s_blocksize_bits); - read_extent_buffer(path->nodes[0], csum, - ((unsigned long)item) + diff, - csum_size * count); -found: - csum += count * csum_size; - nblocks -= count; -next: - while (count > 0) { - count--; - disk_bytenr += fs_info->sectorsize; - offset += fs_info->sectorsize; - page_bytes_left -= fs_info->sectorsize; - if (!page_bytes_left) - break; /* move to next bio */ } } - WARN_ON_ONCE(count); btrfs_free_path(path); return BLK_STS_OK; } @@ -406,7 +500,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -433,8 +527,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> - fs_info->sb->s_blocksize_bits; + offset = (start - key.offset) >> fs_info->sectorsize_bits; if (offset * csum_size 
< btrfs_item_size_nr(leaf, path->slots[0] - 1)) path->slots[0]--; @@ -484,10 +577,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> - fs_info->sb->s_blocksize_bits; + offset = (start - key.offset) >> fs_info->sectorsize_bits; offset *= csum_size; - size >>= fs_info->sb->s_blocksize_bits; + size >>= fs_info->sectorsize_bits; read_extent_buffer(path->nodes[0], sums->sums, @@ -539,7 +631,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -557,7 +648,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, else offset = 0; /* shut up gcc */ - sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; + sums->bytenr = bio->bi_iter.bi_sector << 9; index = 0; shash->tfm = fs_info->csum_shash; @@ -596,7 +687,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, ordered = btrfs_lookup_ordered_extent(inode, offset); ASSERT(ordered); /* Logic error */ - sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) + sums->bytenr = (bio->bi_iter.bi_sector << 9) + total_bytes; index = 0; } @@ -607,7 +698,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, fs_info->sectorsize, sums->sums + index); kunmap_atomic(data); - index += csum_size; + index += fs_info->csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; @@ -637,14 +728,14 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; u64 csum_end; u64 end_byte = bytenr + len; - u32 blocksize_bits = fs_info->sb->s_blocksize_bits; + u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; - csum_end <<= fs_info->sb->s_blocksize_bits; + csum_end <<= blocksize_bits; csum_end += key->offset; if (key->offset < bytenr && csum_end <= end_byte) { @@ -691,8 +782,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - int blocksize_bits = fs_info->sb->s_blocksize_bits; + const u32 csum_size = fs_info->csum_size; + u32 blocksize_bits = fs_info->sectorsize_bits; ASSERT(root == fs_info->csum_root || root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -706,7 +797,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { if (path->slots[0] == 0) @@ -846,7 +936,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, int index = 0; int found_next; int ret; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + const u32 csum_size = fs_info->csum_size; path = btrfs_alloc_path(); if (!path) @@ -921,7 +1011,7 @@ again: if (btrfs_leaf_free_space(leaf) >= csum_size) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + fs_info->sectorsize_bits; goto extend_csum; } @@ -939,8 
+1029,7 @@ again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - csum_offset = (bytenr - found_key.offset) >> - fs_info->sb->s_blocksize_bits; + csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || @@ -956,7 +1045,7 @@ extend_csum: u32 diff; tmp = sums->len - total_bytes; - tmp >>= fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); extend_nr = max_t(int, 1, (int)tmp); @@ -981,9 +1070,9 @@ insert: u64 tmp; tmp = sums->len - total_bytes; - tmp >>= fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sectorsize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> - fs_info->sb->s_blocksize_bits); + fs_info->sectorsize_bits); tmp = max_t(u64, 1, tmp); tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size)); @@ -991,10 +1080,8 @@ insert: } else { ins_size = csum_size; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); - path->leave_spinning = 0; if (ret < 0) goto out; if (WARN_ON(ret != 0)) @@ -1007,8 +1094,7 @@ csum: item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: - ins_size = (u32)(sums->len - total_bytes) >> - fs_info->sb->s_blocksize_bits; + ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits; ins_size *= csum_size; ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, ins_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4373da7bcc0d..0e41459b8de6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -462,7 +462,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached) + struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int err = 0; @@ -474,7 +474,13 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; - start_pos = pos & ~((u64) fs_info->sectorsize - 1); + if (write_bytes == 0) + return 0; + + if (noreserve) + extra_bits |= EXTENT_NORESERVE; + + start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); @@ -671,14 +677,16 @@ next: * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. + * + * Note: the VFS inode's number of bytes is not updated; it is up to the caller + * to deal with that. We set the field 'bytes_found' of the arguments structure + * with the number of allocated bytes found in the target range, so that the + * caller can update the inode's number of bytes in an atomic way when + * replacing extents in a range to avoid races with stat(2).
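+ * + * One way a caller might apply the result (an editor's sketch, not code from + * this patch): + * + * ret = btrfs_drop_extents(trans, root, inode, &args); + * if (ret == 0 && args.bytes_found) + * inode_sub_bytes(&inode->vfs_inode, args.bytes_found);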
*/ -int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache, - int replace_extent, - u32 extent_item_size, - int *key_inserted) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; @@ -686,14 +694,13 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; - struct inode *vfs_inode = &inode->vfs_inode; u64 ino = btrfs_ino(inode); - u64 search_start = start; + u64 search_start = args->start; u64 disk_bytenr = 0; u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; - u64 last_end = start; + u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; @@ -703,11 +710,26 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int update_refs; int found = 0; int leafs_visited = 0; + struct btrfs_path *path = args->path; + + args->bytes_found = 0; + args->extent_inserted = false; - if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); + /* Must always have a path if ->replace_extent is true */ + ASSERT(!(args->replace_extent && !args->path)); - if (start >= inode->disk_i_size && !replace_extent) + if (!path) { + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + } + + if (args->drop_cache) + btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0); + + if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || @@ -718,7 +740,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, search_start, modify_tree); if (ret < 0) break; - if (ret > 0 && path->slots[0] > 0 && search_start == start) { + if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && @@ -753,7 +775,7 @@ next_slot: path->slots[0]++; goto next_slot; } - if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) break; fi = btrfs_item_ptr(leaf, path->slots[0], @@ -795,7 +817,7 @@ next_slot: } found = 1; - search_start = max(key.offset, start); + search_start = max(key.offset, args->start); if (recow || !modify_tree) { modify_tree = -1; btrfs_release_path(path); @@ -806,7 +828,7 @@ next_slot: * | - range to drop - | * | -------- extent -------- | */ - if (start > key.offset && end < extent_end) { + if (args->start > key.offset && args->end < extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -814,7 +836,7 @@ next_slot: } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = start; + new_key.offset = args->start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { @@ -828,15 +850,15 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_offset += start - key.offset; + extent_offset += args->start - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - 
extent_end - start); + extent_end - args->start); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { @@ -846,11 +868,11 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - start - extent_offset); + args->start - extent_offset); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } - key.offset = start; + key.offset = args->start; } /* * From here on out we will have actually dropped something, so @@ -862,23 +884,23 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start <= key.offset && end < extent_end) { + if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = end; + new_key.offset = args->end; btrfs_set_item_key_safe(fs_info, path, &new_key); - extent_offset += end - key.offset; + extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - end); + extent_end - args->end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, end - key.offset); + args->bytes_found += args->end - key.offset; break; } @@ -887,7 +909,7 @@ next_slot: * | ---- range to drop ----- | * | -------- extent -------- | */ - if (start > key.offset && end >= extent_end) { + if (args->start > key.offset && args->end >= extent_end) { BUG_ON(del_nr > 0); if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; @@ -895,11 +917,11 @@ next_slot: } btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); + args->start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(vfs_inode, extent_end - start); - if (end == extent_end) + args->bytes_found += extent_end - args->start; + if (args->end == extent_end) break; path->slots[0]++; @@ -910,7 +932,7 @@ next_slot: * | ---- range to drop ----- | * | ------ extent ------ | */ - if (start <= key.offset && end >= extent_end) { + if (args->start <= key.offset && args->end >= extent_end) { delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; @@ -922,8 +944,7 @@ delete_extent_item: if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { @@ -936,11 +957,10 @@ delete_extent_item: key.offset - extent_offset); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(vfs_inode, - extent_end - key.offset); + args->bytes_found += extent_end - key.offset; } - if (end == extent_end) + if (args->end == extent_end) break; if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { @@ -970,7 +990,7 @@ delete_extent_item: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion (if replace_extent != 0). + * path->slots[0] for our insertion (if args->replace_extent). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); @@ -984,15 +1004,14 @@ delete_extent_item: * which case it unlocked our path, so check path->locks[0] matches a * write lock. 
*/ - if (!ret && replace_extent && leafs_visited == 1 && - (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || - path->locks[0] == BTRFS_WRITE_LOCK) && + if (!ret && args->replace_extent && leafs_visited == 1 && + path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { + sizeof(struct btrfs_item) + args->extent_item_size) { key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; + key.offset = args->start; if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { struct btrfs_key slot_key; @@ -1000,30 +1019,18 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, &extent_item_size, 1); - *key_inserted = 1; + setup_items_for_insert(root, path, &key, + &args->extent_item_size, 1); + args->extent_inserted = true; } - if (!replace_extent || !(*key_inserted)) + if (!args->path) + btrfs_free_path(path); + else if (!args->extent_inserted) btrfs_release_path(path); - if (drop_end) - *drop_end = found ? min(end, last_end) : end; - return ret; -} - -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, - u64 end, int drop_cache) -{ - struct btrfs_path *path; - int ret; +out: + args->drop_end = found ? min(args->end, last_end) : args->end; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, - end, NULL, drop_cache, 0, 0, NULL); - btrfs_free_path(path); return ret; } @@ -1558,11 +1565,84 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } +static void update_time_for_write(struct inode *inode) +{ + struct timespec64 now; + + if (IS_NOCMTIME(inode)) + return; + + now = current_time(inode); + if (!timespec64_equal(&inode->i_mtime, &now)) + inode->i_mtime = now; + + if (!timespec64_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + +static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, + size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos = iocb->ki_pos; + int ret; + loff_t oldsize; + loff_t start_pos; + + if (iocb->ki_flags & IOCB_NOWAIT) { + size_t nocow_bytes = count; + + /* We will allocate space in case nodatacow is not set, so bail */ + if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) + return -EAGAIN; + /* + * There are holes in the range or parts of the range that must + * be COWed (shared extents, RO block groups, etc), so just bail + * out. + */ + if (nocow_bytes < count) + return -EAGAIN; + } + + current->backing_dev_info = inode_to_bdi(inode); + ret = file_remove_privs(file); + if (ret) + return ret; + + /* + * We reserve space for updating the inode when we reserve space for the + * extent we are going to write, so we will enospc out there. We don't + * need to start yet another transaction to update the inode as we will + * update the inode when we finish writing whatever data we write. 
+ */ + update_time_for_write(inode); + + start_pos = round_down(pos, fs_info->sectorsize); + oldsize = i_size_read(inode); + if (start_pos > oldsize) { + /* Expand hole size to cover write data, preventing empty gap */ + loff_t end_pos = round_up(pos + count, fs_info->sectorsize); + + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); + if (ret) { + current->backing_dev_info = NULL; + return ret; + } + } + + return 0; +} + static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; + loff_t pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct page **pages = NULL; @@ -1572,17 +1652,37 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, u64 lockend; size_t num_written = 0; int nrptrs; - int ret = 0; + ssize_t ret; bool only_release_metadata = false; bool force_page_uptodate = false; + loff_t old_isize = i_size_read(inode); + unsigned int ilock_flags = 0; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + ret = btrfs_inode_lock(inode, ilock_flags); + if (ret < 0) + return ret; + + ret = generic_write_checks(iocb, i); + if (ret <= 0) + goto out; + ret = btrfs_write_check(iocb, i, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; + if (!pages) { + ret = -ENOMEM; + goto out; + } while (iov_iter_count(i) > 0) { struct extent_state *cached_state = NULL; @@ -1591,8 +1691,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - offset); - size_t num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); + size_t num_pages; size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1600,8 +1699,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t num_sectors; int extents_locked; - WARN_ON(num_pages > nrptrs); - /* * Fault pages before locking them in prepare_pages * to avoid recursive lock @@ -1613,35 +1710,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, only_release_metadata = false; sector_offset = pos & (fs_info->sectorsize - 1); - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, write_bytes); if (ret < 0) { + /* + * If we don't have to COW at the offset, reserve + * metadata only. write_bytes may get smaller than + * requested here. + */ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ + &write_bytes) > 0) only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. 
- */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + - sector_offset, - fs_info->sectorsize); - } else { + else break; - } } + num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); + WARN_ON(num_pages > nrptrs); + reserve_bytes = round_up(write_bytes + sector_offset, + fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); @@ -1710,8 +1800,7 @@ again: if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << - fs_info->sb->s_blocksize_bits; + release_bytes -= dirty_sectors << fs_info->sectorsize_bits; if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); @@ -1730,10 +1819,9 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - if (copied > 0) - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, - &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's @@ -1758,17 +1846,6 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - if (only_release_metadata && copied > 0) { - lockstart = round_down(pos, - fs_info->sectorsize); - lockend = round_up(pos + copied, - fs_info->sectorsize) - 1; - - set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_NORESERVE, NULL, - NULL, GFP_NOFS); - } - btrfs_drop_pages(pages, num_pages); cond_resched(); @@ -1795,24 +1872,102 @@ again: } extent_changeset_free(data_reserved); + if (num_written > 0) { + pagecache_isize_extended(inode, old_isize, iocb->ki_pos); + iocb->ki_pos += num_written; + } +out: + btrfs_inode_unlock(inode, ilock_flags); return num_written ? 
num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const u32 blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); loff_t pos; - ssize_t written; + ssize_t written = 0; ssize_t written_buffered; loff_t endbyte; - int err; + ssize_t err; + unsigned int ilock_flags = 0; + struct iomap_dio *dio = NULL; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; + + /* If the write DIO is within EOF, use a shared lock */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + ilock_flags |= BTRFS_ILOCK_SHARED; + +relock: + err = btrfs_inode_lock(inode, ilock_flags); + if (err < 0) + return err; + + err = generic_write_checks(iocb, from); + if (err <= 0) { + btrfs_inode_unlock(inode, ilock_flags); + return err; + } + + err = btrfs_write_check(iocb, from, err); + if (err < 0) { + btrfs_inode_unlock(inode, ilock_flags); + goto out; + } + + pos = iocb->ki_pos; + /* + * Re-check since file size may have changed just before taking the + * lock or pos may have changed because of O_APPEND in generic_write_check() + */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && + pos + iov_iter_count(from) > i_size_read(inode)) { + btrfs_inode_unlock(inode, ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } - written = btrfs_direct_IO(iocb, from); + if (check_direct_IO(fs_info, from, pos)) { + btrfs_inode_unlock(inode, ilock_flags); + goto buffered; + } - if (written < 0 || !iov_iter_count(from)) - return written; + dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + btrfs_inode_unlock(inode, ilock_flags); + + if (IS_ERR_OR_NULL(dio)) { + err = PTR_ERR_OR_ZERO(dio); + if (err < 0 && err != -ENOTBLK) + goto out; + } else { + written = iomap_dio_complete(dio); + } + + if (written < 0 || !iov_iter_count(from)) { + err = written; + goto out; + } + +buffered: pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -1838,24 +1993,6 @@ out: return written ? 
written : err; } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; - - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -1863,148 +2000,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 start_pos; - u64 end_pos; ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; - ssize_t err; - loff_t pos; - size_t count; - loff_t oldsize; - int clean_page = 0; - - if (!(iocb->ki_flags & IOCB_DIRECT) && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - err = generic_write_checks(iocb, from); - if (err <= 0) { - inode_unlock(inode); - return err; - } - - pos = iocb->ki_pos; - count = iov_iter_count(from); - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* - * We will allocate space in case nodatacow is not set, - * so bail - */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) - <= 0) { - inode_unlock(inode); - return -EAGAIN; - } - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) { - inode_unlock(inode); - return -EAGAIN; - } - } - - current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_privs(file); - if (err) { - inode_unlock(inode); - goto out; - } /* - * If BTRFS flips readonly due to some impossible error - * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), - * although we have opened a file as writable, we have - * to stop this write operation to ensure FS consistency. + * If the fs flips readonly due to some impossible error, although we + * have opened a file as writable, we have to stop this write operation + * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - inode_unlock(inode); - err = -EROFS; - goto out; - } + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + return -EROFS; - /* - * We reserve space for updating the inode when we reserve space for the - * extent we are going to write, so we will enospc out there. We don't - * need to start yet another transaction to update the inode as we will - * update the inode when we finish writing whatever data we write. - */ - update_time_for_write(inode); - - start_pos = round_down(pos, fs_info->sectorsize); - oldsize = i_size_read(inode); - if (start_pos > oldsize) { - /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + count, - fs_info->sectorsize); - err = btrfs_cont_expand(inode, oldsize, end_pos); - if (err) { - inode_unlock(inode); - goto out; - } - if (start_pos > round_up(oldsize, fs_info->sectorsize)) - clean_page = 1; - } + if (!(iocb->ki_flags & IOCB_DIRECT) && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * 1. We must always clear IOCB_DSYNC in order to not deadlock - * in iomap, as it calls generic_write_sync() in this case. 
- * 2. If we are async, we can call iomap_dio_complete() either - * in - * - * 2.1. A worker thread from the last bio completed. In this - * case we need to mark the btrfs_dio_data that it is - * async in order to call generic_write_sync() properly. - * This is handled by setting BTRFS_DIO_SYNC_STUB in the - * current->journal_info. - * 2.2 The submitter context, because all IO completed - * before we exited iomap_dio_rw(). In this case we can - * just re-set the IOCB_DSYNC on the iocb and we'll do - * the sync below. If our ->end_io() gets called and - * current->journal_info is set, then we know we're in - * our current context and we will clear - * current->journal_info to indicate that we need to - * sync below. - */ - if (sync) { - ASSERT(current->journal_info == NULL); - iocb->ki_flags &= ~IOCB_DSYNC; - current->journal_info = BTRFS_DIO_SYNC_STUB; - } - num_written = __btrfs_direct_write(iocb, from); - - /* - * As stated above, we cleared journal_info, so we need to do - * the sync ourselves. - */ - if (sync && current->journal_info == NULL) - iocb->ki_flags |= IOCB_DSYNC; - current->journal_info = NULL; - } else { + if (iocb->ki_flags & IOCB_DIRECT) + num_written = btrfs_direct_write(iocb, from); + else num_written = btrfs_buffered_write(iocb, from); - if (num_written > 0) - iocb->ki_pos = pos + num_written; - if (clean_page) - pagecache_isize_extended(inode, oldsize, - i_size_read(inode)); - } - - inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -2019,9 +2036,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); -out: + current->backing_dev_info = NULL; - return num_written ? num_written : err; + return num_written; } int btrfs_release_file(struct inode *inode, struct file *filp) @@ -2116,13 +2133,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); - /* - * We take the dio_sem here because the tree log stuff can race with - * lockless dio writes and get an extent map logged for an extent we - * never waited on. We need it this high up for lockdep reasons. - */ - down_write(&BTRFS_I(inode)->dio_sem); - atomic_inc(&root->log_batch); /* @@ -2153,7 +2163,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2250,7 +2259,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); if (ret != BTRFS_NO_LOG_SYNC) { @@ -2281,7 +2289,6 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2443,13 +2450,13 @@ out: * em->start + em->len > start) * When a hole extent is found, return 1 and modify start/len. 
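/*
 * [Editor's note -- illustrative sketch, not part of the patch] Control
 * flow of the "relock:" pattern used by btrfs_direct_write() earlier in
 * this hunk: take the inode lock shared when the write appears to stay
 * within EOF, then re-check under the lock (i_size may have grown, or pos
 * may have moved for O_APPEND) and upgrade to exclusive if needed. The
 * toy_* helpers below are stubs standing in for btrfs_inode_lock(),
 * btrfs_inode_unlock() and i_size_read():
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_inode { uint64_t isize; /* plus a shared/exclusive lock in real code */ };

static void toy_lock(struct toy_inode *in, bool shared)   { (void)in; (void)shared; }
static void toy_unlock(struct toy_inode *in, bool shared) { (void)in; (void)shared; }

static void locked_dio_write(struct toy_inode *inode, uint64_t pos, uint64_t count)
{
	bool shared = (pos + count <= inode->isize);

relock:
	toy_lock(inode, shared);
	/* Re-check under the lock; the unlocked check above may be stale. */
	if (shared && pos + count > inode->isize) {
		toy_unlock(inode, true);
		shared = false;
		goto relock;
	}
	/* ... issue the direct IO ... */
	toy_unlock(inode, shared);
}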
*/ -static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em; int ret = 0; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + em = btrfs_get_extent(inode, NULL, 0, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) @@ -2509,13 +2516,14 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, - struct inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, - const u64 replace_len) + const u64 replace_len, + const u64 bytes_to_drop) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2527,10 +2535,12 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, return 0; if (extent_info->disk_offset == 0 && - btrfs_fs_incompat(fs_info, NO_HOLES)) + btrfs_fs_incompat(fs_info, NO_HOLES)) { + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; + } - key.objectid = btrfs_ino(BTRFS_I(inode)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -2551,23 +2561,25 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - extent_info->file_offset, replace_len); + ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, + replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. 
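/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * accounting change running through these hunks: the drop loop no longer
 * calls inode_sub_bytes() per extent; it accumulates the removed bytes
 * into args->bytes_found and the caller applies a single delta afterwards
 * via btrfs_update_inode_bytes(). In miniature (toy types, no locking):
 */
#include <stdint.h>

struct toy_drop_args { uint64_t bytes_found; };

static void drop_one_extent(struct toy_drop_args *args, uint64_t ext_bytes)
{
	/* Record what was dropped; do not touch the inode byte count here. */
	args->bytes_found += ext_bytes;
}

static void apply_inode_bytes(uint64_t *i_bytes, uint64_t added, uint64_t removed)
{
	/* One update, done under whatever lock protects i_bytes in real code. */
	*i_bytes = *i_bytes + added - removed;
}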
*/ - if (extent_info->disk_offset == 0) + if (extent_info->disk_offset == 0) { + btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; + } - inode_add_bytes(inode, replace_len); + btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, - btrfs_ino(BTRFS_I(inode)), + btrfs_ino(inode), extent_info->file_offset, extent_info->qgroup_reserved, &key); @@ -2579,7 +2591,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); + btrfs_ino(inode), ref_offset); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2602,6 +2614,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); @@ -2610,7 +2623,6 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, struct btrfs_block_rsv *rsv; unsigned int rsv_count; u64 cur_offset; - u64 drop_end; u64 len = end - start; int ret = 0; @@ -2649,10 +2661,16 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = rsv; cur_offset = start; + drop_args.path = path; + drop_args.end = end + 1; + drop_args.drop_cache = true; while (cur_offset < end) { - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, - cur_offset, end + 1, &drop_end, - 1, 0, 0, NULL); + drop_args.start = cur_offset; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + /* If we are punching a hole decrement the inode's byte count */ + if (!extent_info) + btrfs_update_inode_bytes(BTRFS_I(inode), 0, + drop_args.bytes_found); if (ret != -ENOSPC) { /* * When cloning we want to avoid transaction aborts when @@ -2669,10 +2687,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = &fs_info->trans_block_rsv; - if (!extent_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2683,7 +2701,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2691,7 +2709,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * file extent is inserted here. 
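/*
 * [Editor's note -- illustrative fragment, not part of the patch] The new
 * calling convention, as exercised just above in
 * btrfs_replace_file_extents(): callers fill a btrfs_drop_extents_args
 * instead of passing a long positional argument list. Field names are
 * taken from this patch; surrounding setup is elided:
 *
 *	struct btrfs_drop_extents_args drop_args = { 0 };
 *
 *	drop_args.path = path;	// optional: per the cleanup earlier in this
 *				// hunk, a NULL path is allocated internally
 *	drop_args.start = cur_offset;
 *	drop_args.end = end + 1;
 *	drop_args.drop_cache = true;
 *	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
 *	// on return: drop_args.drop_end, drop_args.bytes_found and
 *	// drop_args.extent_inserted carry the results
 */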
*/ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, + drop_args.drop_end - cur_offset); if (ret) { /* * We couldn't clear our area, so we could @@ -2703,11 +2722,14 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } - if (extent_info && drop_end > extent_info->file_offset) { - u64 replace_len = drop_end - extent_info->file_offset; + if (extent_info && + drop_args.drop_end > extent_info->file_offset) { + u64 replace_len = drop_args.drop_end - + extent_info->file_offset; - ret = btrfs_insert_replace_extent(trans, inode, path, - extent_info, replace_len); + ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), + path, extent_info, replace_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); break; @@ -2717,9 +2739,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_end; + cur_offset = drop_args.drop_end; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -2739,7 +2761,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, trans->block_rsv = rsv; if (!extent_info) { - ret = find_first_non_hole(inode, &cur_offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &cur_offset, + &len); if (unlikely(ret < 0)) break; if (ret && !len) { @@ -2776,25 +2799,26 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * will not record the existence of the hole region * [existing_hole_start, lockend]. */ - if (drop_end <= end) - drop_end = end + 1; + if (drop_args.drop_end <= end) + drop_args.drop_end = end + 1; /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && + cur_offset < drop_args.drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_end); + cur_offset, drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!extent_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. 
*/ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_end - cur_offset); + cur_offset, drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2802,8 +2826,9 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, - extent_info->data_len); + ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path, + extent_info, extent_info->data_len, + drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2849,7 +2874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); ino_size = round_up(inode->i_size, fs_info->sectorsize); - ret = find_first_non_hole(inode, &offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { @@ -2874,7 +2899,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, offset, len, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); } else { ret = 0; } @@ -2884,7 +2910,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { inode_unlock(inode); return ret; @@ -2899,7 +2925,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* after truncate page, check hole again */ len = offset + len - lockstart; offset = lockstart; - ret = find_first_non_hole(inode, &offset, &len); + ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { @@ -2913,14 +2939,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start = lockend + 1; tail_len = offset + len - tail_start; if (tail_len) { - ret = find_first_non_hole(inode, &tail_start, &tail_len); + ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); if (unlikely(ret < 0)) goto out_only_mutex; if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(inode, + ret = btrfs_truncate_block(BTRFS_I(inode), tail_start + tail_len, 0, 1); if (ret) @@ -2954,7 +2980,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -2981,7 +3007,7 @@ out_only_mutex: } else { int ret2; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -3049,8 +3075,8 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode->i_ctime = current_time(inode); i_size_write(inode, end); - btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode(trans, root, inode); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 
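/*
 * [Editor's note -- illustrative sketch, not part of the patch] The shape
 * of the btrfs_replace_file_extents() loop in the hunks above, reduced to
 * stub form: drop a chunk, patch up the hole (or clear the mapped range
 * past EOF), or insert the replacement extent, then advance the cursor to
 * drop_end and renew the transaction budget. Every helper below is a
 * trivial stand-in; only the order of operations is taken from the patch:
 */
#include <stdint.h>
#include <stdbool.h>

static int drop_extents(uint64_t cur, uint64_t end, uint64_t *drop_end)
{ (void)cur; *drop_end = end; return 0; }
static bool punching_hole(void) { return true; }
static int fill_or_clear_hole(uint64_t c, uint64_t d) { (void)c; (void)d; return 0; }
static int insert_replacement(uint64_t c, uint64_t d) { (void)c; (void)d; return 0; }
static void renew_transaction(void) { }

static int replace_file_extents_outline(uint64_t start, uint64_t end)
{
	uint64_t cur = start;
	int ret = 0;

	while (cur < end) {
		uint64_t drop_end;

		ret = drop_extents(cur, end, &drop_end);
		if (ret)
			break;
		if (punching_hole())
			ret = fill_or_clear_hole(cur, drop_end);
		else
			ret = insert_replacement(cur, drop_end);
		if (ret)
			break;
		cur = drop_end;
		renew_transaction();
	}
	return ret;
}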
ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; @@ -3162,7 +3188,8 @@ static int btrfs_zero_range(struct inode *inode, } if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { free_extent_map(em); - ret = btrfs_truncate_block(inode, offset, len, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, + 0); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, @@ -3193,7 +3220,7 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(inode, offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) goto out; } else { @@ -3210,7 +3237,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(inode, offset + len, 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, + 0, 1); if (ret) goto out; } else { @@ -3280,6 +3308,10 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; + /* Do not allow fallocate in ZONED mode */ + if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) + return -EOPNOTSUPP; + alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); cur_offset = alloc_start; @@ -3304,7 +3336,7 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3320,7 +3352,7 @@ static long btrfs_fallocate(struct file *file, int mode, * But that's a minor problem and won't do much harm BTW. */ if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, i_size_read(inode), + ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), alloc_start); if (ret) goto out; @@ -3330,7 +3362,7 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. 
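/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * fallocate paths above round the requested range out to block boundaries
 * with round_down()/round_up(). For a power-of-two blocksize these reduce
 * to mask arithmetic; a worked standalone example:
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define RND_DOWN(x, sz)	((x) & ~((uint64_t)(sz) - 1))
#define RND_UP(x, sz)	RND_DOWN((x) + (sz) - 1, sz)

int main(void)
{
	uint64_t offset = 1000, len = 3000, blocksize = 4096;
	uint64_t alloc_start = RND_DOWN(offset, blocksize);
	uint64_t alloc_end = RND_UP(offset + len, blocksize);

	/* [0, 4096): the 4000-byte request grows to one full block. */
	assert(alloc_start == 0 && alloc_end == 4096);
	printf("allocate [%llu, %llu)\n",
	       (unsigned long long)alloc_start, (unsigned long long)alloc_end);
	return 0;
}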
*/ - ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); if (ret) goto out; } @@ -3543,9 +3575,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - inode_lock_shared(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); offset = find_desired_extent(inode, offset, whence); - inode_unlock_shared(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; } @@ -3561,16 +3593,47 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static int check_direct_read(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int ret; + int i, seg; + + ret = check_direct_IO(fs_info, iter, offset); + if (ret < 0) + return ret; + + if (!iter_is_iovec(iter)) + return 0; + + for (seg = 0; seg < iter->nr_segs; seg++) + for (i = seg + 1; i < iter->nr_segs; i++) + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + return -EINVAL; + return 0; +} + +static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + return 0; + + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + is_sync_kiocb(iocb)); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + return ret; +} + static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; if (iocb->ki_flags & IOCB_DIRECT) { - struct inode *inode = file_inode(iocb->ki_filp); - - inode_lock_shared(inode); - ret = btrfs_direct_IO(iocb, to); - inode_unlock_shared(inode); + ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index af0013d3df63..4d8897879c9c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "disk-io.h" #include "extent_io.h" -#include "inode-map.h" #include "volumes.h" #include "space-info.h" #include "delalloc-space.h" @@ -33,16 +32,18 @@ struct btrfs_trim_range { struct list_head list; }; -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path); +static int search_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes, bool for_alloc); +static void free_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); +static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, @@ -141,17 +142,15 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | 
BTRFS_INODE_PREALLOC; + /* We inline CRCs for the free disk space cache */ + const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | + BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; - /* We inline crc's for the free disk space cache */ - if (ino != BTRFS_FREE_INO_OBJECTID) - flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; - leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -207,6 +206,65 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, ino, block_group->start); } +/* + * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode + * handles lookup, otherwise it takes ownership and iputs the inode. + * Don't reuse an inode pointer after passing it into this function. + */ +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (!inode) + inode = lookup_free_space_inode(block_group, path); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) != -ENOENT) + ret = PTR_ERR(inode); + goto out; + } + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_add_delayed_iput(inode); + goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for the lookup ref */ + btrfs_add_delayed_iput(inode); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.type = 0; + key.offset = block_group->start; + ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, + -1, 1); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); +out: + btrfs_free_path(path); + return ret; +} + int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { @@ -267,12 +325,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. 
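/*
 * [Editor's note -- illustrative sketch, not part of the patch] The new
 * btrfs_remove_free_space_inode() above documents an "optional sink"
 * parameter: pass NULL and the function looks the object up itself, pass a
 * pointer and the function takes ownership, so the caller must not touch
 * it afterwards. A userspace analog of the same contract:
 */
#include <stdio.h>

/* If f is NULL, open the file ourselves; either way we close it. */
static int consume_file(FILE *f, const char *path)
{
	if (!f)
		f = fopen(path, "r");
	if (!f)
		return -1;
	/* ... use the stream ... */
	fclose(f);	/* ownership ends here in both cases */
	return 0;
}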
*/ - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 0, BTRFS_EXTENT_DATA_KEY); if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); fail: if (locked) @@ -304,16 +362,11 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int write) { int num_pages; - int check_crcs = 0; num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) - check_crcs = 1; - /* Make sure we can fit our crcs and generation into the first page */ - if (write && check_crcs && - (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) + if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); @@ -324,7 +377,6 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, io_ctl->num_pages = num_pages; io_ctl->fs_info = btrfs_sb(inode->i_sb); - io_ctl->check_crcs = check_crcs; io_ctl->inode = inode; return 0; @@ -419,13 +471,8 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) * Skip the csum areas. If we don't check crcs then we just have a * 64bit chunk at the front of the first page. */ - if (io_ctl->check_crcs) { - io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); - io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); @@ -439,14 +486,8 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) * Skip the crc area. If we don't check crcs then we just have a 64bit * chunk at the front of the first page. */ - if (io_ctl->check_crcs) { - io_ctl->cur += sizeof(u32) * io_ctl->num_pages; - io_ctl->size -= sizeof(u64) + - (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); cache_gen = get_unaligned_le64(io_ctl->cur); if (cache_gen != generation) { @@ -466,11 +507,6 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_unmap_page(io_ctl); - return; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; @@ -488,11 +524,6 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) u32 crc = ~(u32)0; unsigned offset = 0; - if (!io_ctl->check_crcs) { - io_ctl_map_page(io_ctl, 0); - return 0; - } - if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; @@ -625,42 +656,42 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, return 0; } -/* - * Since we attach pinned extents after the fact we can have contiguous sections - * of free space that are split up in entries. This poses a problem with the - * tree logging stuff since it could have allocated across what appears to be 2 - * entries since we would have merged the entries when adding the pinned extents - * back to the free space cache. So run through the space cache that we just - * loaded and merge contiguous entries. This will make the log replay stuff not - * blow up and it will make for nicer allocator behavior. 
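/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * rationale comment above belongs to merge_space_tree(), whose removal
 * follows. Its core contiguity test -- prev->offset + prev->bytes ==
 * e->offset -- in standalone form over a sorted array instead of the
 * rbtree the kernel walks:
 */
#include <stddef.h>
#include <stdint.h>

struct span { uint64_t offset, bytes; };

/* Merge adjacent spans in place; input sorted by offset. Returns new count. */
static size_t merge_spans(struct span *s, size_t n)
{
	size_t out = 0;

	for (size_t i = 0; i < n; i++) {
		if (out && s[out - 1].offset + s[out - 1].bytes == s[i].offset)
			s[out - 1].bytes += s[i].bytes;	/* contiguous: extend */
		else
			s[out++] = s[i];
	}
	return out;
}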
- */ -static void merge_space_tree(struct btrfs_free_space_ctl *ctl) +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { - struct btrfs_free_space *e, *prev = NULL; - struct rb_node *n; + struct btrfs_block_group *block_group = ctl->private; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; + u64 size = block_group->length; + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); -again: - spin_lock(&ctl->tree_lock); - for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { - e = rb_entry(n, struct btrfs_free_space, offset_index); - if (!prev) - goto next; - if (e->bitmap || prev->bitmap) - goto next; - if (prev->offset + prev->bytes == e->offset) { - unlink_free_space(ctl, prev); - unlink_free_space(ctl, e); - prev->bytes += e->bytes; - kmem_cache_free(btrfs_free_space_cachep, e); - link_free_space(ctl, prev); - prev = NULL; - spin_unlock(&ctl->tree_lock); - goto again; - } -next: - prev = e; - } - spin_unlock(&ctl->tree_lock); + max_bitmaps = max_t(u64, max_bitmaps, 1); + + ASSERT(ctl->total_bitmaps <= max_bitmaps); + + /* + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. + */ + if (size < SZ_1G) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); + + bitmap_bytes = ctl->total_bitmaps * ctl->unit; + + /* + * we want the extent entry threshold to always be at most 1/2 the max + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); + + ctl->extents_thresh = + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, @@ -753,16 +784,6 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } - /* - * Sync discard ensures that the free space cache is always - * trimmed. So when reading this in, the state should reflect - * that. We also do this for async as a stop gap for lack of - * persistence. 
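/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * arithmetic in the relocated recalculate_thresholds() above, as a
 * standalone calculation. CACHE_BYTES_PER_GIG and ENTRY_SIZE below are
 * placeholder values (the real MAX_CACHE_BYTES_PER_GIG and
 * sizeof(struct btrfs_free_space) are defined elsewhere in btrfs, not in
 * this patch); only the shape of the computation is taken from above:
 */
#include <stdint.h>
#include <stdio.h>

#define SZ_1G			(1024ULL * 1024 * 1024)
#define CACHE_BYTES_PER_GIG	(64ULL * 1024)	/* placeholder */
#define ENTRY_SIZE		48		/* placeholder */

int main(void)
{
	uint64_t size = 4 * SZ_1G;	/* block group length */
	uint64_t unit = 4096;		/* ctl->unit (sectorsize) */
	uint64_t total_bitmaps = 2;

	uint64_t max_bytes = (size < SZ_1G) ? CACHE_BYTES_PER_GIG
					    : CACHE_BYTES_PER_GIG * (size / SZ_1G);
	uint64_t bitmap_bytes = total_bitmaps * unit;
	uint64_t extent_bytes = max_bytes - bitmap_bytes;

	/* Extent entries may use at most half of the overall budget. */
	if (extent_bytes > max_bytes >> 1)
		extent_bytes = max_bytes >> 1;

	printf("extents_thresh = %llu\n",
	       (unsigned long long)(extent_bytes / ENTRY_SIZE));
	return 0;
}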
- */ - if (btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC)) - e->trim_state = BTRFS_TRIM_STATE_TRIMMED; - if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -791,7 +812,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_err(fs_info, @@ -816,19 +837,11 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; - e->bitmap_extents = count_bitmap_extents(ctl, e); - if (!btrfs_free_space_trimmed(e)) { - ctl->discardable_extents[BTRFS_STAT_CURR] += - e->bitmap_extents; - ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; - } } io_ctl_drop_pages(&io_ctl); - merge_space_tree(ctl); ret = 1; out: - btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: @@ -837,10 +850,46 @@ free_cache: goto out; } +static int copy_free_space_cache(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int ret = 0; + + while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (!info->bitmap) { + unlink_free_space(ctl, info); + ret = btrfs_add_free_space(block_group, info->offset, + info->bytes); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + u64 offset = info->offset; + u64 bytes = ctl->unit; + + while (search_bitmap(ctl, info, &offset, &bytes, + false) == 0) { + ret = btrfs_add_free_space(block_group, offset, + bytes); + if (ret) + break; + bitmap_clear_bits(ctl, info, offset, bytes); + offset = info->offset; + bytes = ctl->unit; + } + free_bitmap(ctl, info); + } + cond_resched(); + } + return ret; +} + int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space_ctl tmp_ctl = {}; struct inode *inode; struct btrfs_path *path; int ret = 0; @@ -848,6 +897,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group) u64 used = block_group->used; /* + * Because we could potentially discard our loaded free space, we want + * to load everything into a temporary structure first, and then if it's + * valid copy it all into the actual free space ctl. + */ + btrfs_init_free_space_ctl(block_group, &tmp_ctl); + + /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. 
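/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * surrounding load_free_space_cache() hunks now parse the on-disk cache
 * into a temporary ctl, validate its free-space total against what the
 * block group says should be free, and only then copy entries into the
 * live structure. The same defensive shape in miniature (toy types, no
 * locking, parse_cache() is a stub):
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_ctl { uint64_t free_space; /* ... entries ... */ };

static bool parse_cache(struct toy_ctl *tmp) { tmp->free_space = 0; return true; }
static void copy_entries(struct toy_ctl *dst, struct toy_ctl *src)
{ dst->free_space = src->free_space; }

static int load_cache(struct toy_ctl *live, uint64_t expected_free)
{
	struct toy_ctl tmp = { 0 };

	if (!parse_cache(&tmp))
		return -1;
	if (tmp.free_space != expected_free)
		return -1;	/* stale or corrupt: discard tmp, rebuild slowly */
	copy_entries(live, &tmp);
	return 0;
}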
*/ @@ -898,19 +954,25 @@ int load_free_space_cache(struct btrfs_block_group *block_group) } spin_unlock(&block_group->lock); - ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, + ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; - spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->length - used - - block_group->bytes_super)); - spin_unlock(&ctl->tree_lock); + matched = (tmp_ctl.free_space == (block_group->length - used - + block_group->bytes_super)); - if (!matched) { - __btrfs_remove_free_space_cache(ctl); + if (matched) { + ret = copy_free_space_cache(block_group, &tmp_ctl); + /* + * ret == 1 means we successfully loaded the free space cache, + * so we need to re-set it here. + */ + if (ret == 0) + ret = 1; + } else { + __btrfs_remove_free_space_cache(&tmp_ctl); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); @@ -929,6 +991,9 @@ out: block_group->start); } + spin_lock(&ctl->tree_lock); + btrfs_discard_update_discardable(block_group); + spin_unlock(&ctl->tree_lock); iput(inode); return ret; } @@ -1191,7 +1256,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1220,14 +1285,6 @@ out: } -static int btrfs_wait_cache_io_root(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path) -{ - return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0); -} - int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) @@ -1332,7 +1389,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Everything is written out, now we dirty the pages in the file. */ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), - &cached_state); + &cached_state, false); if (ret) goto out_nospc; @@ -1381,7 +1438,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1672,44 +1729,6 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, return ret; } -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_block_group *block_group = ctl->private; - u64 max_bytes; - u64 bitmap_bytes; - u64 extent_bytes; - u64 size = block_group->length; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - - max_bitmaps = max_t(u64, max_bitmaps, 1); - - ASSERT(ctl->total_bitmaps <= max_bitmaps); - - /* - * We are trying to keep the total amount of memory used per 1GiB of - * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation - * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of - * bitmaps, we may end up using more memory than this. - */ - if (size < SZ_1G) - max_bytes = MAX_CACHE_BYTES_PER_GIG; - else - max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - - bitmap_bytes = ctl->total_bitmaps * ctl->unit; - - /* - * we want the extent entry threshold to always be at most 1/2 the max - * bytes we can have, or whatever is less than that. 
- */ - extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); - - ctl->extents_thresh = - div_u64(extent_bytes, sizeof(struct btrfs_free_space)); -} - static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) @@ -1912,29 +1931,6 @@ out: return NULL; } -static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info) -{ - struct btrfs_block_group *block_group = ctl->private; - u64 bytes = bitmap_info->bytes; - unsigned int rs, re; - int count = 0; - - if (!block_group || !bytes) - return count; - - bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, - BITS_PER_BITMAP) { - bytes -= (rs - re) * ctl->unit; - count++; - - if (!bytes) - break; - } - - return count; -} - static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { @@ -1944,8 +1940,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; - - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static void free_bitmap(struct btrfs_free_space_ctl *ctl, @@ -1967,7 +1962,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl, kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, @@ -2134,7 +2129,6 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } static const struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2508,7 +2502,7 @@ link: if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2643,7 +2637,7 @@ again: goto again; } out_lock: - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); out: return ret; @@ -2674,10 +2668,10 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, "%d blocks of free space at or bigger than bytes is", count); } -void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group) +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; @@ -2779,7 +2773,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); if (ctl->private) - btrfs_discard_update_discardable(ctl->private, ctl); + btrfs_discard_update_discardable(ctl->private); spin_unlock(&ctl->tree_lock); } @@ -2801,7 +2795,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); - btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } @@ -2885,7 +2879,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, link_free_space(ctl, entry); } out: - 
btrfs_discard_update_discardable(block_group, ctl); + btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (align_gap_len) @@ -3054,7 +3048,7 @@ out: kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); + recalculate_thresholds(ctl); } else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; } @@ -3828,166 +3822,62 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, return ret; } -/* - * Find the left-most item in the cache tree, and then return the - * smallest inode number in the item. - * - * Note: the returned inode number may not be the smallest one in - * the tree, if the left-most item is a bitmap. - */ -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) { - struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; - struct btrfs_free_space *entry = NULL; - u64 ino = 0; - - spin_lock(&ctl->tree_lock); - - if (RB_EMPTY_ROOT(&ctl->free_space_offset)) - goto out; - - entry = rb_entry(rb_first(&ctl->free_space_offset), - struct btrfs_free_space, offset_index); - - if (!entry->bitmap) { - ino = entry->offset; - - unlink_free_space(ctl, entry); - entry->offset++; - entry->bytes--; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } else { - u64 offset = 0; - u64 count = 1; - int ret; - - ret = search_bitmap(ctl, entry, &offset, &count, true); - /* Logic error; Should be empty if it can't find anything */ - ASSERT(!ret); - - ino = offset; - bitmap_clear_bits(ctl, entry, offset, 1); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } -out: - spin_unlock(&ctl->tree_lock); - - return ino; + return btrfs_super_cache_generation(fs_info->super_copy); } -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path) +static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans) { - struct inode *inode = NULL; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_inode) - inode = igrab(root->ino_cache_inode); - spin_unlock(&root->ino_cache_lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, 0); - if (IS_ERR(inode)) - return inode; - - spin_lock(&root->ino_cache_lock); - if (!btrfs_fs_closing(root->fs_info)) - root->ino_cache_inode = igrab(inode); - spin_unlock(&root->ino_cache_lock); - - return inode; -} - -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - return __create_free_space_inode(root, trans, path, - BTRFS_FREE_INO_OBJECTID, 0); -} - -int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - int ret = 0; - u64 root_gen = btrfs_root_generation(&root->root_item); - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. 
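/*
 * [Editor's note -- illustrative sketch, not part of the patch] The new
 * cleanup_free_space_cache_v1() above walks every block group and removes
 * its v1 cache inode, bailing out on the first failure. The same
 * iterate-and-bail shape over a plain array (the kernel version walks the
 * block group rbtree with rb_first()/rb_entry()/rb_next()):
 */
#include <stddef.h>

struct toy_bg { int id; };

static int remove_cache_inode(struct toy_bg *bg) { (void)bg; return 0; }

static int cleanup_all(struct toy_bg *groups, size_t n)
{
	int ret = 0;

	for (size_t i = 0; i < n && ret == 0; i++)
		ret = remove_cache_inode(&groups[i]);
	return ret;	/* first error wins; remaining groups are untouched */
}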
- */ - if (btrfs_fs_closing(fs_info)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - goto out; - - if (root_gen != BTRFS_I(inode)->generation) - goto out_put; + struct btrfs_block_group *block_group; + struct rb_node *node; + int ret; - ret = __load_free_space_cache(root, inode, ctl, path, 0); + btrfs_info(fs_info, "cleaning free space cache v1"); - if (ret < 0) - btrfs_err(fs_info, - "failed to load free ino cache for root %llu", - root->root_key.objectid); -out_put: - iput(inode); + node = rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group, cache_node); + ret = btrfs_remove_free_space_inode(trans, NULL, block_group); + if (ret) + goto out; + node = rb_next(node); + } out: - btrfs_free_path(path); return ret; } -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_trans_handle *trans; int ret; - struct btrfs_io_ctl io_ctl; - bool release_metadata = true; - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; + /* + * update_super_roots will appropriately set or unset + * super_copy->cache_generation based on SPACE_CACHE and + * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a + * transaction commit whether we are enabling space cache v1 and don't + * have any other work to do, or are disabling it and removing free + * space inodes. + */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); - memset(&io_ctl, 0, sizeof(io_ctl)); - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans); - if (!ret) { - /* - * At this point writepages() didn't error out, so our metadata - * reservation is released when the writeback finishes, at - * inode.c:btrfs_finish_ordered_io(), regardless of it finishing - * with or without an error. 
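/*
 * [Editor's note -- illustrative sketch, not part of the patch] The
 * btrfs_write_out_ino_cache() removal in this hunk uses a common
 * error-path idiom: once writeback has been submitted successfully,
 * responsibility for releasing the metadata reservation moves to the IO
 * completion path, so a local flag (release_metadata in the deleted code)
 * records whether the error path still owns that cleanup. In miniature,
 * with stub helpers:
 */
#include <stdbool.h>

static int submit_io(void)		{ return 0; }	/* stub */
static int wait_for_io(void)		{ return 0; }	/* stub */
static void release_reservation(void)	{ }		/* stub */

static int write_out(void)
{
	bool release_on_error = true;
	int ret;

	ret = submit_io();
	if (!ret) {
		/* Completion path releases the reservation from here on. */
		release_on_error = false;
		ret = wait_for_io();
	}
	if (ret && release_on_error)
		release_reservation();
	return ret;
}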
- */ - release_metadata = false; - ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path); + if (!active) { + set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); + ret = cleanup_free_space_cache_v1(fs_info, trans); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } } - if (ret) { - if (release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size, true); - btrfs_debug(fs_info, - "failed to write free ino cache for root %llu error %d", - root->root_key.objectid, ret); - } + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e3d5e0ad8f8e..ecb09a02d544 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -60,7 +60,6 @@ struct btrfs_free_space_ctl { }; struct btrfs_free_space_op { - void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); }; @@ -76,7 +75,6 @@ struct btrfs_io_ctl { int num_pages; int entries; int bitmaps; - unsigned check_crcs:1; }; struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -84,6 +82,9 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); +int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_block_group *block_group); int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); @@ -97,19 +98,9 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path); -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path); -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path); -int load_free_ino_cache(struct btrfs_fs_info *fs_info, - struct btrfs_root *root); -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode); -void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); +void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, u64 bytenr, u64 size, @@ -126,7 +117,6 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes); int btrfs_find_space_cluster(struct btrfs_block_group *block_group, @@ -148,6 +138,8 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async); +bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); +int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct 
btrfs_block_group *cache, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 6b9faf3b0e96..e33a65bd9a0c 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -136,9 +136,10 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return 0; } -static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info, + u64 size) { - return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); + return DIV_ROUND_UP(size >> fs_info->sectorsize_bits, BITS_PER_BYTE); } static unsigned long *alloc_bitmap(u32 bitmap_size) @@ -200,8 +201,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->length, - fs_info->sectorsize); + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -290,8 +290,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, u32 data_size; extent_size = min(end - i, bitmap_range); - data_size = free_space_bitmap_size(extent_size, - fs_info->sectorsize); + data_size = free_space_bitmap_size(fs_info, extent_size); key.objectid = i; key.type = BTRFS_FREE_SPACE_BITMAP_KEY; @@ -339,8 +338,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, int done = 0, nr; int ret; - bitmap_size = free_space_bitmap_size(block_group->length, - fs_info->sectorsize); + bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -383,8 +381,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, fs_info->sectorsize * BITS_PER_BYTE); bitmap_cursor = ((char *)bitmap) + bitmap_pos; - data_size = free_space_bitmap_size(found_key.offset, - fs_info->sectorsize); + data_size = free_space_bitmap_size(fs_info, + found_key.offset); ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); read_extent_buffer(leaf, bitmap_cursor, ptr, @@ -416,7 +414,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize); + nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; start_bit = find_next_bit_le(bitmap, nrbits, 0); while (start_bit < nrbits) { @@ -540,8 +538,8 @@ static void free_space_set_bits(struct btrfs_block_group *block_group, end = found_end; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - first = div_u64(*start - found_start, fs_info->sectorsize); - last = div_u64(end - found_start, fs_info->sectorsize); + first = (*start - found_start) >> fs_info->sectorsize_bits; + last = (end - found_start) >> fs_info->sectorsize_bits; if (bit) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else @@ -1195,8 +1193,6 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - key.objectid = 0; key.type = 0; key.offset = 0; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 668701832845..37f36ffdaf6b 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -119,8 +119,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; @@ -193,8 +191,6 @@ int 
btrfs_del_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = -ENOENT; @@ -270,7 +266,6 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -327,7 +322,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c deleted file mode 100644 index 76d2e43817ea..000000000000 --- a/fs/btrfs/inode-map.c +++ /dev/null @@ -1,582 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Oracle. All rights reserved. - */ - -#include <linux/kthread.h> -#include <linux/pagemap.h> - -#include "ctree.h" -#include "disk-io.h" -#include "free-space-cache.h" -#include "inode-map.h" -#include "transaction.h" -#include "delalloc-space.h" - -static void fail_caching_thread(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - btrfs_warn(fs_info, "failed to start inode caching task"); - btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_ERROR; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); -} - -static int caching_kthread(void *data) -{ - struct btrfs_root *root = data; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - u64 last = (u64)-1; - int slot; - int ret; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) { - fail_caching_thread(root); - return -ENOMEM; - } - - /* Since the commit root is read-only, we can safely skip locking. */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = READA_FORWARD; - - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; -again: - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - if (btrfs_fs_closing(fs_info)) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - if (WARN_ON(btrfs_header_nritems(leaf) == 0)) - break; - - /* - * Save the key so we can advances forward - * in the next search. 
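The deleted caching thread above relies on a resumable-scan pattern: when it has to yield, it saves the current key, releases the path and commit_root_sem, and later re-searches from the saved key. A minimal stand-alone sketch of that pattern in plain C, with an array standing in for the btree and all names invented:

#include <stdio.h>

int main(void)
{
	int keys[] = { 1, 2, 4, 7, 8, 11, 12, 20 };	/* stand-in for inode items */
	int n = sizeof(keys) / sizeof(keys[0]);
	int resume_from = 0;				/* the saved search key */
	int yielded = 0;

again:
	for (int i = 0; i < n; i++) {
		if (keys[i] < resume_from)
			continue;	/* after a restart, skip work already done */
		if (!yielded && keys[i] == 8) {
			/* pretend a commit forces us to yield: save the key,
			 * drop all search state, then search again */
			yielded = 1;
			resume_from = keys[i];
			goto again;
		}
		printf("process %d\n", keys[i]);
	}
	return 0;
}

Every key is still processed exactly once; the saved key only makes the restarted search cheap.
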
- */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); - schedule_timeout(1); - goto again; - } else - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.type != BTRFS_INODE_ITEM_KEY) - goto next; - - if (key.objectid >= root->highest_objectid) - break; - - if (last != (u64)-1 && last + 1 != key.objectid) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - key.objectid - last - 1, 0); - wake_up(&root->ino_cache_wait); - } - - last = key.objectid; -next: - path->slots[0]++; - } - - if (last < root->highest_objectid - 1) { - __btrfs_add_free_space(fs_info, ctl, last + 1, - root->highest_objectid - last - 1, 0); - } - - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - - root->ino_cache_progress = (u64)-1; - btrfs_unpin_free_ino(root); -out: - wake_up(&root->ino_cache_wait); - up_read(&fs_info->commit_root_sem); - - btrfs_free_path(path); - - return ret; -} - -static void start_caching(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct task_struct *tsk; - int ret; - u64 objectid; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_NO) { - spin_unlock(&root->ino_cache_lock); - return; - } - - root->ino_cache_state = BTRFS_CACHE_STARTED; - spin_unlock(&root->ino_cache_lock); - - ret = load_free_ino_cache(fs_info, root); - if (ret == 1) { - spin_lock(&root->ino_cache_lock); - root->ino_cache_state = BTRFS_CACHE_FINISHED; - spin_unlock(&root->ino_cache_lock); - wake_up(&root->ino_cache_wait); - return; - } - - /* - * It can be quite time-consuming to fill the cache by searching - * through the extent tree, and this can keep ino allocation path - * waiting. Therefore at start we quickly find out the highest - * inode number and we know we can use inode numbers which fall in - * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. 
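In numbers: BTRFS_LAST_FREE_OBJECTID is -256ULL, so once the highest allocated objectid is known, everything above it is immediately usable without waiting for the scan. A plain-C illustration with an assumed highest objectid:

#include <stdio.h>

int main(void)
{
	unsigned long long last_free = -256ULL;	/* BTRFS_LAST_FREE_OBJECTID */
	unsigned long long highest = 4242;	/* assumed highest objectid in this root */

	/* numbers above the highest existing objectid are free by definition */
	printf("usable now: [%llu, %llu], %llu inode numbers\n",
	       highest + 1, last_free, last_free - highest);
	return 0;
}
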
- */ - ret = btrfs_find_free_objectid(root, &objectid); - if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { - __btrfs_add_free_space(fs_info, ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1, - 0); - wake_up(&root->ino_cache_wait); - } - - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu", - root->root_key.objectid); - if (IS_ERR(tsk)) - fail_caching_thread(root); -} - -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) -{ - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return btrfs_find_free_objectid(root, objectid); - -again: - *objectid = btrfs_find_ino_for_alloc(root); - - if (*objectid != 0) - return 0; - - start_caching(root); - - wait_event(root->ino_cache_wait, - root->ino_cache_state == BTRFS_CACHE_FINISHED || - root->ino_cache_state == BTRFS_CACHE_ERROR || - root->free_ino_ctl->free_space > 0); - - if (root->ino_cache_state == BTRFS_CACHE_FINISHED && - root->free_ino_ctl->free_space == 0) - return -ENOSPC; - else if (root->ino_cache_state == BTRFS_CACHE_ERROR) - return btrfs_find_free_objectid(root, objectid); - else - goto again; -} - -void btrfs_return_ino(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return; -again: - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - } else { - down_write(&fs_info->commit_root_sem); - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->ino_cache_lock); - up_write(&fs_info->commit_root_sem); - goto again; - } - spin_unlock(&root->ino_cache_lock); - - start_caching(root); - - __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); - - up_write(&fs_info->commit_root_sem); - } -} - -/* - * When a transaction is committed, we'll move those inode numbers which are - * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and - * others will just be dropped, because the commit root we were searching has - * changed. - * - * Must be called with root->fs_info->commit_root_sem held - */ -void btrfs_unpin_free_ino(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; - spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; - struct btrfs_free_space *info; - struct rb_node *n; - u64 count; - - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) - return; - - while (1) { - spin_lock(rbroot_lock); - n = rb_first(rbroot); - if (!n) { - spin_unlock(rbroot_lock); - break; - } - - info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); /* Logic error */ - - if (info->offset > root->ino_cache_progress) - count = 0; - else - count = min(root->ino_cache_progress - info->offset + 1, - info->bytes); - - rb_erase(&info->offset_index, rbroot); - spin_unlock(rbroot_lock); - if (count) - __btrfs_add_free_space(root->fs_info, ctl, - info->offset, count, 0); - kmem_cache_free(btrfs_free_space_cachep, info); - } -} - -#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_SIZE * 8) - -/* - * The goal is to keep the memory used by the free_ino tree won't - * exceed the memory if we use bitmaps only. 
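The threshold arithmetic that follows can be worked through with concrete numbers; here sizeof(struct btrfs_free_space) is assumed to be 72 bytes and pages are 4 KiB (both illustrative, the real values vary by build):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define INODES_PER_BITMAP (PAGE_SIZE * 8)	/* one bit per inode number */
#define INFO_SIZE 72UL				/* assumed sizeof(struct btrfs_free_space) */
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	unsigned long max_ino = 100000;		/* assumed highest inode number */
	unsigned long total_bitmaps = 1;	/* bitmaps already allocated */
	unsigned long max_bitmaps = ALIGN_UP(max_ino, INODES_PER_BITMAP) /
				    INODES_PER_BITMAP;
	unsigned long thresh = 0;

	if (max_bitmaps > total_bitmaps)
		thresh = (max_bitmaps - total_bitmaps) * PAGE_SIZE / INFO_SIZE;

	/* with these inputs: max_bitmaps=4, thresh=170; the initial
	 * threshold (SZ_32K / 2) / 72 would be 227 entries */
	printf("max_bitmaps=%lu extents_thresh=%lu\n", max_bitmaps, thresh);
	return 0;
}
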
- */ -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *n; - int max_ino; - int max_bitmaps; - - n = rb_last(&ctl->free_space_offset); - if (!n) { - ctl->extents_thresh = INIT_THRESHOLD; - return; - } - info = rb_entry(n, struct btrfs_free_space, offset_index); - - /* - * Find the maximum inode number in the filesystem. Note we - * ignore the fact that this can be a bitmap, because we are - * not doing precise calculation. - */ - max_ino = info->bytes - 1; - - max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; - if (max_bitmaps <= ctl->total_bitmaps) { - ctl->extents_thresh = 0; - return; - } - - ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_SIZE / sizeof(*info); -} - -/* - * We don't fall back to bitmap, if we are below the extents threshold - * or this chunk of inode numbers is a big one. - */ -static bool use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - if (ctl->free_extents < ctl->extents_thresh || - info->bytes > INODES_PER_BITMAP / 10) - return false; - - return true; -} - -static const struct btrfs_free_space_op free_ino_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - -static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) -{ -} - -static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - /* - * We always use extents for two reasons: - * - * - The pinned tree is only used during the process of caching - * work. - * - Make code simpler. See btrfs_unpin_free_ino(). - */ - return false; -} - -static const struct btrfs_free_space_op pinned_free_ino_op = { - .recalc_thresholds = pinned_recalc_thresholds, - .use_bitmap = pinned_use_bitmap, -}; - -void btrfs_init_free_ino_ctl(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - spin_lock_init(&ctl->tree_lock); - ctl->unit = 1; - ctl->start = 0; - ctl->private = NULL; - ctl->op = &free_ino_op; - INIT_LIST_HEAD(&ctl->trimming_ranges); - mutex_init(&ctl->cache_writeout_mutex); - - /* - * Initially we allow to use 16K of ram to cache chunks of - * inode numbers before we resort to bitmaps. This is somewhat - * arbitrary, but it will be adjusted in runtime. 
- */ - ctl->extents_thresh = INIT_THRESHOLD; - - spin_lock_init(&pinned->tree_lock); - pinned->unit = 1; - pinned->start = 0; - pinned->private = NULL; - pinned->extents_thresh = 0; - pinned->op = &pinned_free_ino_op; -} - -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - struct btrfs_block_rsv *rsv; - struct extent_changeset *data_reserved = NULL; - u64 num_bytes; - u64 alloc_hint = 0; - int ret; - int prealloc; - bool retry = false; - - /* only fs tree and subvol/snap needs ino cache */ - if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && - (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || - root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) - return 0; - - /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0) - return 0; - - if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rsv = trans->block_rsv; - trans->block_rsv = &fs_info->trans_block_rsv; - - num_bytes = trans->bytes_reserved; - /* - * 1 item for inode item insertion if need - * 4 items for inode item update (in the worst case) - * 1 items for slack space if we need do truncation - * 1 item for free space object - * 3 items for pre-allocation - */ - trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10); - ret = btrfs_block_rsv_add(root, trans->block_rsv, - trans->bytes_reserved, - BTRFS_RESERVE_NO_FLUSH); - if (ret) - goto out; - trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, - trans->bytes_reserved, 1); -again: - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { - ret = PTR_ERR(inode); - goto out_release; - } - - if (IS_ERR(inode)) { - BUG_ON(retry); /* Logic error */ - retry = true; - - ret = create_free_ino_inode(root, trans, path); - if (ret) - goto out_release; - goto again; - } - - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(trans, NULL, inode); - if (ret) { - if (ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - goto out_put; - } - } - - spin_lock(&root->ino_cache_lock); - if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { - ret = -1; - spin_unlock(&root->ino_cache_lock); - goto out_put; - } - spin_unlock(&root->ino_cache_lock); - - spin_lock(&ctl->tree_lock); - prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_SIZE; - spin_unlock(&ctl->tree_lock); - - /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_SIZE; - - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0, - prealloc); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, - prealloc, prealloc, &alloc_hint); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); - goto out_put; - } - - ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); -out_put: - iput(inode); -out_release: - trace_btrfs_space_reservation(fs_info, "ino_cache", 
trans->transid, - trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved, NULL); -out: - trans->block_rsv = rsv; - trans->bytes_reserved = num_bytes; - - btrfs_free_path(path); - extent_changeset_free(data_reserved); - return ret; -} - -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) -{ - struct btrfs_path *path; - int ret; - struct extent_buffer *l; - struct btrfs_key search_key; - struct btrfs_key found_key; - int slot; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - search_key.objectid = BTRFS_LAST_FREE_OBJECTID; - search_key.type = -1; - search_key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) - goto error; - BUG_ON(ret == 0); /* Corruption */ - if (path->slots[0] > 0) { - slot = path->slots[0] - 1; - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); - } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) -{ - int ret; - mutex_lock(&root->objectid_mutex); - - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { - btrfs_warn(root->fs_info, - "the objectid of root %llu reaches its highest value", - root->root_key.objectid); - ret = -ENOSPC; - goto out; - } - - *objectid = ++root->highest_objectid; - ret = 0; -out: - mutex_unlock(&root->objectid_mutex); - return ret; -} diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h deleted file mode 100644 index 7a962811dffe..000000000000 --- a/fs/btrfs/inode-map.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef BTRFS_INODE_MAP_H -#define BTRFS_INODE_MAP_H - -void btrfs_init_free_ino_ctl(struct btrfs_root *root); -void btrfs_unpin_free_ino(struct btrfs_root *root); -void btrfs_return_ino(struct btrfs_root *root, u64 objectid); -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans); - -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); - -#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e8d8169779d..8e23780acfae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -45,7 +45,6 @@ #include "compression.h" #include "locking.h" #include "free-space-cache.h" -#include "inode-map.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -62,7 +61,6 @@ struct btrfs_dio_data { loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; - bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -96,6 +94,51 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const bool uptodate); /* + * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * + * ilock_flags can have the following bit set: + * + * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode + * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt + * return -EAGAIN + */ +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_SHARED) { + if (ilock_flags & BTRFS_ILOCK_TRY) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + else + return 0; + } + inode_lock_shared(inode); + } else { + if (ilock_flags & BTRFS_ILOCK_TRY) { + 
if (!inode_trylock(inode)) + return -EAGAIN; + else + return 0; + } + inode_lock(inode); + } + return 0; +} + +/* + * btrfs_inode_unlock - unlock inode i_rwsem + * + * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() + * to decide whether the lock acquired is shared or exclusive. + */ +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +{ + if (ilock_flags & BTRFS_ILOCK_SHARED) + inode_unlock_shared(inode); + else + inode_unlock(inode); +} + +/* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * @@ -158,7 +201,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_path *path, int extent_inserted, + struct btrfs_path *path, bool extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, @@ -179,8 +222,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, if (compressed_size && compressed_pages) cur_size = compressed_size; - inode_add_bytes(inode, size); - if (!extent_inserted) { struct btrfs_key key; size_t datasize; @@ -190,7 +231,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) @@ -256,8 +296,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; - ret = btrfs_update_inode(trans, root, inode); - fail: return ret; } @@ -273,6 +311,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, int compress_type, struct page **compressed_pages) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -283,8 +322,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 data_len = inline_len; int ret; struct btrfs_path *path; - int extent_inserted = 0; - u32 extent_item_size; if (compressed_size) data_len = compressed_size; @@ -310,16 +347,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, } trans->block_rsv = &inode->block_rsv; + drop_args.path = path; + drop_args.start = start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + drop_args.replace_extent = true; + if (compressed_size && compressed_pages) - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( compressed_size); else - extent_item_size = btrfs_file_extent_calc_inline_size( + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, - NULL, 1, 1, extent_item_size, - &extent_inserted); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -327,7 +368,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, path, 
drop_args.extent_inserted, root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); @@ -339,8 +380,17 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, goto out; } + btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret != -ENOSPC) { + btrfs_abort_transaction(trans, ret); + goto out; + } else if (ret == -ENOSPC) { + ret = 1; + goto out; + } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); - btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -1598,6 +1648,15 @@ next_slot: goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; + + /* + * The following checks can be expensive, as they need to + * take other locks and do btree or rbtree searches, so + * release the path to avoid blocking other tasks for too + * long. + */ + btrfs_release_path(path); + /* If extent is RO, we must COW it */ if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; @@ -1673,12 +1732,12 @@ out_check: cur_offset = extent_end; if (cur_offset > end) break; + if (!path->nodes[0]) + continue; path->slots[0]++; goto next_slot; } - btrfs_release_path(path); - /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be @@ -2098,6 +2157,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; + if (*bits & EXTENT_ADD_INODE_BYTES) + inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } @@ -2121,7 +2182,7 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -2150,11 +2211,9 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, - u64 bio_offset) +static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { - struct inode *inode = private_data; - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } @@ -2187,7 +2246,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + !fs_info->csum_root; if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2202,8 +2262,13 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, mirror_num, bio_flags); goto out; - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); + } else { + /* + * Lookup bio sums does extra checks around whether we + * need to csum or not, which is why we ignore skip_sum + * here. 
+ */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) goto out; } @@ -2213,8 +2278,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) goto mapit; /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, - 0, inode, btrfs_submit_bio_start); + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, + 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); @@ -2282,8 +2347,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); + EXTENT_DELALLOC_NEW, 0, NULL, cached_state, + GFP_NOFS, NULL); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2511,9 +2576,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, + const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; + const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; @@ -2521,7 +2588,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); - int extent_inserted = 0; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); @@ -2537,18 +2604,20 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, - file_pos + num_bytes, NULL, 0, - 1, sizeof(*stack_fi), &extent_inserted); + drop_args.path = path; + drop_args.start = file_pos; + drop_args.end = file_pos + num_bytes; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*stack_fi); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; - path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) @@ -2563,7 +2632,24 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(&inode->vfs_inode, num_bytes); + /* + * If we dropped an inline extent here, we know the range where it is + * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the + * number of bytes only for that range containing the inline extent. + * The remainder of the range will be processed when clearing the + * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
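Worked through with assumed numbers (plain C, not kernel code): a 1000-byte inline extent at offset 0 is replaced by an 8 KiB ordered extent on a filesystem with 4 KiB sectors:

#include <stdio.h>

int main(void)
{
	unsigned long long sectorsize  = 4096;
	unsigned long long file_pos    = 0;
	unsigned long long num_bytes   = 8192;	/* length of the new file extent */
	unsigned long long bytes_found = 1000;	/* size of the dropped inline extent */

	if (file_pos == 0 && bytes_found % sectorsize != 0) {
		unsigned long long inline_size =
			bytes_found - (bytes_found / sectorsize) * sectorsize;

		/* the inline range is settled immediately: one sector of new
		 * disk usage replaces the 1000 inline bytes */
		printf("account now: +%llu new bytes, -%llu inline bytes\n",
		       sectorsize, inline_size);
		bytes_found -= inline_size;
		num_bytes -= sectorsize;
	}
	/* the remaining bytes are added either right here (direct IO or a
	 * truncated ordered extent) or when EXTENT_DELALLOC_NEW is cleared */
	printf("still to account: %llu bytes\n", num_bytes);
	return 0;
}
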
+ */ + if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { + u64 inline_size = round_down(drop_args.bytes_found, sectorsize); + + inline_size = drop_args.bytes_found - inline_size; + btrfs_update_inode_bytes(inode, sectorsize, inline_size); + drop_args.bytes_found -= inline_size; + num_bytes -= sectorsize; + } + + if (update_inode_bytes) + btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -2601,6 +2687,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_file_extent_item stack_fi; u64 logical_len; + bool update_inode_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -2616,9 +2703,18 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ + /* + * For delalloc, when completing an ordered extent we update the inode's + * bytes when clearing the range in the inode's io tree, so pass false + * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), + * except if the ordered extent was truncated. + */ + update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), oe->file_offset, &stack_fi, - oe->qgroup_rsv); + update_inode_bytes, oe->qgroup_rsv); } /* @@ -2628,11 +2724,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, */ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { - struct inode *inode = ordered_extent->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; @@ -2640,10 +2736,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; - bool range_locked = false; - bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; - unsigned int clear_bits; + unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; @@ -2651,16 +2745,16 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) - clear_new_delalloc_bytes = true; + clear_bits |= EXTENT_DELALLOC_NEW; - freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); + freespace_inode = btrfs_is_free_space_inode(inode); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), start, end); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -2683,14 +2777,14 @@ static int 
btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } - range_locked = true; + clear_bits |= EXTENT_LOCKED; lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) @@ -2703,13 +2797,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), + ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); @@ -2723,8 +2817,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, + unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -2737,6 +2830,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + /* + * If this is a new delalloc range, clear its new delalloc flag to + * update the inode's number of bytes. This needs to be done first + * before updating the inode item. + */ + if ((clear_bits & EXTENT_DELALLOC_NEW) && + !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) + clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + 0, 0, &cached_state); + btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ @@ -2745,12 +2849,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - clear_bits = EXTENT_DEFRAG; - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + clear_extent_bit(&inode->io_tree, start, end, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); @@ -2765,7 +2864,7 @@ out: clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); + btrfs_drop_extent_cache(inode, unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -2800,7 +2899,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. 
*/ - btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); + btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2841,18 +2940,32 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, btrfs_queue_work(wq, &ordered_extent->work); } +/* + * check_data_csum - verify checksum of one sector of uncompressed data + * @inode: inode + * @io_bio: btrfs_io_bio which contains the csum + * @bio_offset: offset to the beginning of the bio (in bytes) + * @page: page holding the data to be verified + * @pgoff: offset inside the page + * + * The length of such a check is always one sector size. + */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - int icsum, struct page *page, int pgoff, u64 start, - size_t len) + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u32 len = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; + ASSERT(pgoff + len <= PAGE_SIZE); + + offset_sectors = bio_offset >> fs_info->sectorsize_bits; + csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@ -2865,8 +2978,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff, + csum, csum_expected, io_bio->mirror_num); if (io_bio->device) btrfs_dev_stat_inc_and_print(io_bio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -2877,17 +2990,23 @@ zeroit: } /* - * when reads are done, we need to check csums to verify the data is correct + * When reads are done, we need to check csums to verify the data is correct. * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. 
+ * + * @bio_offset: offset to the beginning of the bio (in bytes) + * @start: file offset of the range start + * @end: file offset of the range end (inclusive) + * @mirror: mirror number */ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end, int mirror) { - size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; + const u32 sectorsize = root->fs_info->sectorsize; + u32 pg_off; if (PageChecked(page)) { ClearPageChecked(page); @@ -2897,15 +3016,27 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; + if (!root->fs_info->csum_root) + return 0; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } - phy_offset >>= inode->i_sb->s_blocksize_bits; - return check_data_csum(inode, io_bio, phy_offset, page, offset, start, - (size_t)(end - start + 1)); + ASSERT(page_offset(page) <= start && + end <= page_offset(page) + PAGE_SIZE - 1); + for (pg_off = offset_in_page(start); + pg_off < offset_in_page(end); + pg_off += sectorsize, bio_offset += sectorsize) { + int ret; + + ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off); + if (ret < 0) + return -EIO; + } + return 0; } /* @@ -3515,7 +3646,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; @@ -3526,9 +3658,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, - 1); + ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -3539,9 +3669,9 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, leaf, inode_item, inode); + fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: btrfs_free_path(path); @@ -3552,7 +3682,8 @@ failed: * copy everything in the in-memory inode into the btree. 
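The per-sector loop in btrfs_verify_data_csum() above walks the page in sectorsize steps while advancing bio_offset in lockstep, so each sector finds its checksum at index bio_offset >> sectorsize_bits. A stand-alone sketch with an assumed 512-byte sector size (on typical x86 setups the sector size equals the 4 KiB page size and the loop runs once per page):

#include <stdio.h>

int main(void)
{
	unsigned int sectorsize = 512;			/* assumed, for illustration */
	unsigned int sectorsize_bits = 9;		/* log2(512) */
	unsigned long long page_start = 0x10000;	/* file offset of the page */
	unsigned long long start = 0x10000;		/* range start */
	unsigned long long end = 0x10fff;		/* range end, inclusive */
	unsigned int bio_offset = 0;			/* byte offset into the bio */
	unsigned int pg_off;

	for (pg_off = start - page_start;
	     pg_off < end - page_start;
	     pg_off += sectorsize, bio_offset += sectorsize)
		printf("verify pg_off=%u, csum index=%u\n",
		       pg_off, bio_offset >> sectorsize_bits);
	return 0;
}

Eight sectors are verified here, each against its own checksum slot.
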
*/ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) + struct btrfs_root *root, + struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3564,23 +3695,22 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * The data relocation inode should also be directly updated * without delay */ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) + if (!btrfs_is_free_space_inode(inode) && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); + btrfs_set_inode_last_trans(trans, inode); return ret; } return btrfs_update_inode_item(trans, root, inode); } -noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode) +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode) { int ret; @@ -3615,7 +3745,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - path->leave_spinning = 1; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { @@ -3695,7 +3824,7 @@ err: inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &dir->vfs_inode); + ret = btrfs_update_inode(trans, root, dir); out: return ret; } @@ -3709,7 +3838,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, &inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); } return ret; } @@ -3858,7 +3987,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = current_time(dir); - ret = btrfs_update_inode_fallback(trans, root, dir); + ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -3995,7 +4124,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) struct btrfs_block_rsv block_rsv; u64 root_flags; int ret; - int err; /* * Don't allow to delete a subvolume with send in progress. This is @@ -4017,8 +4145,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) down_write(&fs_info->subvol_sem); - err = may_destroy_subvol(dest); - if (err) + ret = may_destroy_subvol(dest); + if (ret) goto out_up_write; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -4027,13 +4155,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) * two for dir entries, * two for root ref/backref. 
*/ - err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); - if (err) + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); + if (ret) goto out_up_write; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_release; } trans->block_rsv = &block_rsv; @@ -4043,7 +4171,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { - err = ret; btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -4052,7 +4179,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); - dest->root_item.drop_level = 0; + btrfs_set_root_drop_level(&dest->root_item, 0); btrfs_set_root_refs(&dest->root_item, 0); if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { @@ -4061,7 +4188,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } } @@ -4071,7 +4197,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { @@ -4081,7 +4206,6 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); - err = ret; goto out_end_trans; } } @@ -4092,14 +4216,12 @@ out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans); - if (ret && !err) - err = ret; inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); - if (err) { + if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, @@ -4109,15 +4231,9 @@ out_up_write: d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); - - /* the last ref */ - if (dest->ino_cache_inode) { - iput(dest->ino_cache_inode); - dest->ino_cache_inode = NULL; - } } - return err; + return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -4194,7 +4310,7 @@ out: */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct inode *inode, + struct btrfs_inode *inode, u64 new_size, u32 min_type) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4215,7 +4331,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int pending_del_slot = 0; int extent_type = -1; int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); + u64 ino = btrfs_ino(inode); u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; @@ -4229,7 +4345,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * off from time to time. This means all inodes in subvolume roots, * reloc roots, and data reloc roots. 
*/ - if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && + if (!btrfs_is_free_space_inode(inode) && test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; @@ -4239,7 +4355,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* @@ -4247,7 +4363,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * new size is not block aligned since we will be keeping the * last block of the extent just the way it is. */ - btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, + btrfs_drop_extent_cache(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); } @@ -4258,8 +4374,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * it is used to drop the logged items. So we shouldn't kill the delayed * items. */ - if (min_type == 0 && root == BTRFS_I(inode)->root) - btrfs_kill_delayed_inode_items(BTRFS_I(inode)); + if (min_type == 0 && root == inode->root) + btrfs_kill_delayed_inode_items(inode); key.objectid = ino; key.offset = (u64)-1; @@ -4315,14 +4431,13 @@ search_again: btrfs_file_extent_num_bytes(leaf, fi); trace_btrfs_truncate_show_fi_regular( - BTRFS_I(inode), leaf, fi, - found_key.offset); + inode, leaf, fi, found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_ram_bytes(leaf, fi); trace_btrfs_truncate_show_fi_inline( - BTRFS_I(inode), leaf, fi, path->slots[0], + inode, leaf, fi, path->slots[0], found_key.offset); } item_end--; @@ -4361,7 +4476,8 @@ search_again: if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); btrfs_mark_buffer_dirty(leaf); } else { extent_num_bytes = @@ -4376,7 +4492,8 @@ search_again: found_extent = 1; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, num_dec); + inode_sub_bytes(&inode->vfs_inode, + num_dec); } } clear_len = num_dec; @@ -4411,7 +4528,8 @@ search_again: } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); + inode_sub_bytes(&inode->vfs_inode, + item_end + 1 - new_size); } delete: /* @@ -4419,8 +4537,8 @@ delete: * multiple fsyncs, and in this case we don't want to clear the * file extent range because it's just the log. */ - if (root == BTRFS_I(inode)->root) { - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + if (root == inode->root) { + ret = btrfs_inode_clear_file_extent_range(inode, clear_start, clear_len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4529,8 +4647,8 @@ out: if (!ret && last_size > new_size) last_size = new_size; btrfs_inode_safe_disk_i_size_write(inode, last_size); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, - (u64)-1, &cached_state); + unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, + &cached_state); } btrfs_free_path(path); @@ -4548,12 +4666,12 @@ out: * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. 
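The rounding the function performs first can be illustrated with assumed numbers in plain C: zeroing from offset 5000 on a filesystem with 4 KiB blocks only ever touches the block containing that offset:

#include <stdio.h>

int main(void)
{
	unsigned long long blocksize = 4096;
	unsigned long long from = 5000;		/* assumed truncation point */
	unsigned long long block_start = from / blocksize * blocksize;
	unsigned long long block_end = block_start + blocksize - 1;

	/* the block [4096, 8191] is read, zeroed from offset 5000 onward
	 * (for truncate), and dirtied for writeback */
	printf("partial block: [%llu, %llu]\n", block_start, block_end);
	return 0;
}
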
*/ -int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; @@ -4576,30 +4694,29 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, - block_start, blocksize); + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize); if (ret < 0) { - if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, - &write_bytes) > 0) { + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize); if (ret < 0) { if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, block_start, blocksize); + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); goto out; } again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + btrfs_delalloc_release_space(inode, data_reserved, block_start, + blocksize, true); + btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } @@ -4622,7 +4739,7 @@ again: lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start); + ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -4633,11 +4750,11 @@ again: goto again; } - clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, + clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -4663,34 +4780,33 @@ again: unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, - block_end, EXTENT_NORESERVE, NULL, NULL, - GFP_NOFS); + set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); out_unlock: if (ret) { if (only_release_metadata) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - blocksize, true); + btrfs_delalloc_release_metadata(inode, blocksize, true); else - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); } - 
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + btrfs_delalloc_release_extents(inode, blocksize); unlock_page(page); put_page(page); out: if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); + btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, +static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, u64 offset, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* @@ -4698,9 +4814,9 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, * that any holes get logged if we fsync. */ if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = root->log_transid; - BTRFS_I(inode)->last_log_commit = root->last_log_commit; + inode->last_trans = fs_info->generation; + inode->last_sub_trans = root->log_transid; + inode->last_log_commit = root->last_log_commit; return 0; } @@ -4713,19 +4829,25 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); + drop_args.start = offset; + drop_args.end = offset + len; + drop_args.drop_cache = true; + + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } - ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), + ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) + if (ret) { btrfs_abort_transaction(trans, ret); - else + } else { + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); btrfs_update_inode(trans, root, inode); + } btrfs_end_transaction(trans); return ret; } @@ -4736,14 +4858,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for * the range between oldsize and size */ -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; @@ -4763,11 +4885,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start, - block_end - 1, &cached_state); + btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, + &cached_state); cur_offset = hole_start; while (1) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, + em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset); if 
(IS_ERR(em)) { err = PTR_ERR(em); @@ -4786,17 +4908,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (err) break; - err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; - btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + &inode->runtime_flags); goto next; } hole_em->start = cur_offset; @@ -4816,14 +4938,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) write_unlock(&em_tree->lock); if (err != -EEXIST) break; - btrfs_drop_extent_cache(BTRFS_I(inode), - cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); } free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; @@ -4871,7 +4992,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * this truncation. */ btrfs_drew_write_lock(&root->snapshot_lock); - ret = btrfs_cont_expand(inode, oldsize, newsize); + ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) { btrfs_drew_write_unlock(&root->snapshot_lock); return ret; @@ -4884,9 +5005,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) } i_size_write(inode, newsize); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5157,7 +5278,8 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), + 0, 0); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -5184,10 +5306,6 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans); } - if (!(root == fs_info->tree_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) - btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode))); - free_rsv: btrfs_free_block_rsv(fs_info, rsv); no_delete: @@ -5797,7 +5915,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret && ret == -ENOSPC) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -5805,7 +5923,7 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); if (BTRFS_I(inode)->delayed_node) @@ -6068,7 +6186,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail_unlock; @@ -6194,7 +6311,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, parent_inode->vfs_inode.i_mtime = now; parent_inode->vfs_inode.i_ctime = now; } - ret = 
btrfs_update_inode(trans, root, &parent_inode->vfs_inode); + ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6254,7 +6371,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6285,7 +6402,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); d_instantiate_new(dentry, inode); out_unlock: @@ -6318,7 +6435,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6344,7 +6461,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_unlock; @@ -6416,7 +6533,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -6462,7 +6579,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_fail; @@ -6484,7 +6601,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_fail; @@ -6621,12 +6738,14 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path->reada = READA_FORWARD; /* - * Unless we're going to uncompress the inline extent, no sleep would - * happen. + * The same explanation in load_free_space_cache applies here as well, + * we only read when we're loading the free space cache, and at that + * point the commit_root has everything we need. */ - path->leave_spinning = 1; - - path->recurse = btrfs_is_free_space_inode(inode); + if (btrfs_is_free_space_inode(inode)) { + path->search_commit_root = 1; + path->skip_locking = 1; + } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { @@ -6728,7 +6847,6 @@ next: em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - btrfs_set_path_blocking(path); if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { @@ -7377,17 +7495,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; bool unlock_extents = false; - bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); - - /* - * We used current->journal_info here to see if we were sync, but - * there's a lot of tests in the enospc machinery to not do flushing if - * we have a journal_info set, so we need to clear this out and re-set - * it in iomap_end. 
- */ - ASSERT(current->journal_info == NULL || - current->journal_info == BTRFS_DIO_SYNC_STUB); - current->journal_info = NULL; if (!write) len = min_t(u64, len, fs_info->sectorsize); @@ -7413,7 +7520,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; - dio_data->sync = sync; dio_data->length = length; if (write) { dio_data->reserve = round_up(length, fs_info->sectorsize); @@ -7561,14 +7667,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, extent_changeset_free(dio_data->data_reserved); } out: - /* - * We're all done, we can re-set the current->journal_info now safely - * for our endio. - */ - if (dio_data->sync) { - ASSERT(current->journal_info == NULL); - current->journal_info = BTRFS_DIO_SYNC_STUB; - } kfree(dio_data); iomap->private = NULL; @@ -7632,7 +7730,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, struct bio_vec bvec; struct bvec_iter iter; u64 start = io_bio->logical; - int icsum = 0; + u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { @@ -7643,9 +7741,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, icsum, - bvec.bv_page, pgoff, - start, sectorsize))) { + (!csum || !check_data_csum(inode, io_bio, + bio_offset, bvec.bv_page, pgoff))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), @@ -7653,6 +7750,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, } else { blk_status_t status; + ASSERT((start - io_bio->logical) < UINT_MAX); status = btrfs_submit_read_repair(inode, &io_bio->bio, start - io_bio->logical, @@ -7665,7 +7763,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, err = status; } start += sectorsize; - icsum++; + ASSERT(bio_offset + sectorsize > bio_offset); + bio_offset += sectorsize; pgoff += sectorsize; } } @@ -7715,12 +7814,11 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, } } -static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, - struct bio *bio, u64 offset) +static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset) { - struct inode *inode = private_data; - - return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) @@ -7732,8 +7830,7 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), - bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) { @@ -7770,8 +7867,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, - file_offset, inode, + ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { @@ -7786,8 +7882,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, u64 csum_offset; csum_offset = file_offset - dip->logical_offset; - csum_offset >>= inode->i_sb->s_blocksize_bits; - csum_offset *= 
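/*
 * Illustration (not part of the patch): the icsum -> bio_offset change in
 * btrfs_check_read_dio_bio() above walks a read bio sector by sector and
 * asserts that the u32 byte offset cannot wrap. An illustrative user-space
 * loop with the same guard; the sizes are made up.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize = 4096;
	const uint32_t bio_size = 4 * sectorsize;	/* 4 sectors in the bio */
	uint32_t bio_offset = 0;
	uint64_t start = 1 << 20;			/* logical byte offset */

	while (bio_offset < bio_size) {
		printf("sector at logical %llu, bio offset %u\n",
		       (unsigned long long)start, bio_offset);
		start += sectorsize;
		assert(bio_offset + sectorsize > bio_offset);	/* no u32 wrap */
		bio_offset += sectorsize;
	}
	return 0;
}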
btrfs_super_csum_size(fs_info->super_copy); + csum_offset >>= fs_info->sectorsize_bits; + csum_offset *= fs_info->csum_size; btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: @@ -7812,11 +7908,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip_size = sizeof(*dip); if (!write && csum) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); size_t nblocks; - nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; - dip_size += csum_size * nblocks; + nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; + dip_size += fs_info->csum_size * nblocks; } dip = kzalloc(dip_size, GFP_NOFS); @@ -7826,7 +7921,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->inode = inode; dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); return dip; @@ -7836,7 +7931,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); @@ -7863,13 +7957,14 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, return BLK_QC_T_NONE; } - if (!write && csum) { + if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. + * + * If we have csums disabled this will do nothing. */ - status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, - dip->csums); + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; } @@ -7944,129 +8039,15 @@ out_err: return BLK_QC_T_NONE; } -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int seg; - int i; - unsigned int blocksize_mask = fs_info->sectorsize - 1; - ssize_t retval = -EINVAL; - - if (offset & blocksize_mask) - goto out; - - if (iov_iter_alignment(iter) & blocksize_mask) - goto out; - - /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) - return 0; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) - goto out; - } - } - retval = 0; -out: - return retval; -} - -static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned flags) -{ - /* - * Now if we're still in the context of our submitter we know we can't - * safely run generic_write_sync(), so clear our flag here so that the - * caller knows to follow up with a sync. 
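/*
 * Illustration (not part of the patch): locating the checksums for a partial
 * bio, as in btrfs_submit_dio_bio() above - the byte distance from the start
 * of the DIO is converted to a sector count, then scaled by the per-sector
 * checksum size. Illustrative numbers: sectorsize_bits is 12 for 4 KiB
 * sectors and csum_size is 4 for crc32c.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned sectorsize_bits = 12;	/* 4 KiB sectors */
	const unsigned csum_size = 4;		/* crc32c */
	uint64_t logical_offset = 0;		/* DIO starts here */
	uint64_t file_offset = 3 * 4096;	/* this bio starts 3 sectors in */

	uint64_t csum_offset = file_offset - logical_offset;
	csum_offset >>= sectorsize_bits;	/* bytes -> sectors: 3 */
	csum_offset *= csum_size;		/* sectors -> bytes into the csum array: 12 */

	printf("csum buffer offset: %llu\n", (unsigned long long)csum_offset);
	return 0;
}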
- */ - if (current->journal_info == BTRFS_DIO_SYNC_STUB) { - current->journal_info = NULL; - return error; - } - - if (error) - return error; - - if (size) { - iocb->ki_flags |= IOCB_DSYNC; - return generic_write_sync(iocb, size); - } - - return 0; -} - -static const struct iomap_ops btrfs_dio_iomap_ops = { +const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -static const struct iomap_dio_ops btrfs_dio_ops = { +const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; -static const struct iomap_dio_ops btrfs_sync_dops = { - .submit_io = btrfs_submit_direct, - .end_io = btrfs_maybe_fsync_end_io, -}; - -ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_changeset *data_reserved = NULL; - loff_t offset = iocb->ki_pos; - size_t count = 0; - bool relock = false; - ssize_t ret; - - if (check_direct_IO(fs_info, iter, offset)) - return 0; - - count = iov_iter_count(iter); - if (iov_iter_rw(iter) == WRITE) { - /* - * If the write DIO is beyond the EOF, we need update - * the isize, but it is protected by i_mutex. So we can - * not unlock the i_mutex at this case. - */ - if (offset + count <= inode->i_size) { - inode_unlock(inode); - relock = true; - } - down_read(&BTRFS_I(inode)->dio_sem); - } - - /* - * We have are actually a sync iocb, so we need our fancy endio to know - * if we need to sync. - */ - if (current->journal_info) - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_sync_dops, is_sync_kiocb(iocb)); - else - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, - &btrfs_dio_ops, is_sync_kiocb(iocb)); - - if (ret == -ENOTBLK) - ret = 0; - - if (iov_iter_rw(iter) == WRITE) - up_read(&BTRFS_I(inode)->dio_sem); - - if (relock) - inode_lock(inode); - - extent_changeset_free(data_reserved); - return ret; -} - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -8186,6 +8167,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, u64 start; u64 end; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + bool found_ordered = false; + bool completed_ordered = false; /* * we have the page locked, so new writeback can't start, @@ -8207,15 +8190,17 @@ again: start = page_start; ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { + found_ordered = true; end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); /* - * IO on this page will never be started, so we need - * to account for any ordered extents now + * IO on this page will never be started, so we need to account + * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW + * here, must leave that up for the ordered extent completion. 
*/ if (!inode_evicting) clear_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); /* @@ -8237,8 +8222,10 @@ again: if (btrfs_dec_test_ordered_pending(inode, &ordered, start, - end - start + 1, 1)) + end - start + 1, 1)) { btrfs_finish_ordered_io(ordered); + completed_ordered = true; + } } btrfs_put_ordered_extent(ordered); if (!inode_evicting) { @@ -8267,10 +8254,23 @@ */ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { + bool delete = true; + + /* + * If there's an ordered extent for this range and we have not + * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set + * in the range for the ordered extent completion. We must also + * not delete the range, otherwise we would lose that bit (and + * any other bits set in the range). Make sure EXTENT_UPTODATE + * is cleared if we don't delete, otherwise it can lead to + * corruption if the i_size is extended later. + */ + if (found_ordered && !completed_ordered) delete = false; clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state); + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete, &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -8519,14 +8519,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) trans->block_rsv = rsv; while (1) { - ret = btrfs_truncate_inode_items(trans, root, inode, + ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, BTRFS_EXTENT_DATA_KEY); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; @@ -8557,7 +8557,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8565,14 +8565,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret2 && !ret) ret = ret2; @@ -8618,7 +8618,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); - err = btrfs_update_inode(trans, new_root, inode); + err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); iput(inode); return err; @@ -8680,7 +8680,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); - init_rwsem(&ei->dio_sem); return inode; } @@ -8820,6 +8819,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; + u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; @@ -8846,8 +8846,9 @@ static 
int btrfs_getattr(const struct path *path, struct kstat *stat, spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; + inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); - stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } @@ -8973,7 +8974,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8989,7 +8990,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, dest, new_inode); + ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9078,7 +9079,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) return ret; @@ -9109,7 +9110,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: unlock_new_inode(inode); if (ret) @@ -9243,7 +9244,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9541,7 +9542,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_ino(root, &objectid); + err = btrfs_find_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9603,7 +9604,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode(trans, root, BTRFS_I(inode)); /* * Last step, add directory indexes for our symlink inode. 
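/*
 * Illustration (not part of the patch): btrfs_getattr() above reports
 * st_blocks as on-disk bytes plus not-yet-flushed delalloc bytes, each
 * rounded up to the filesystem block size and then expressed in 512-byte
 * units. A stand-alone recomputation with illustrative values.
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint32_t blocksize = 4096;
	uint64_t inode_bytes = 10000;	/* bytes already allocated */
	uint64_t delalloc_bytes = 5000;	/* new delalloc, not on disk yet */

	uint64_t blocks = (ALIGN(inode_bytes, blocksize) +
			   ALIGN(delalloc_bytes, blocksize)) >> 9;

	/* 12288 + 8192 = 20480 bytes -> 40 sectors of 512 bytes. */
	printf("st_blocks = %llu\n", (unsigned long long)blocks);
	return 0;
}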
This is the * last step to avoid extra cleanup of these indexes if an error happens @@ -9629,7 +9630,8 @@ out_unlock: static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_trans_handle *trans_in, - struct inode *inode, struct btrfs_key *ins, + struct btrfs_inode *inode, + struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; @@ -9650,13 +9652,14 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); + ret = btrfs_qgroup_release_data(inode, file_offset, len); if (ret < 0) return ERR_PTR(ret); if (trans) { - ret = insert_reserved_file_extent(trans, BTRFS_I(inode), - file_offset, &stack_fi, ret); + ret = insert_reserved_file_extent(trans, inode, + file_offset, &stack_fi, + true, ret); if (ret) return ERR_PTR(ret); return trans; @@ -9676,7 +9679,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (!path) return ERR_PTR(-ENOMEM); - ret = btrfs_replace_file_extents(inode, path, file_offset, + ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); @@ -9732,7 +9735,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, clear_offset += ins.offset; last_alloc = ins.offset; - trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), + &ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. @@ -9794,10 +9798,10 @@ next: else i_size = cur_offset; i_size_write(inode, i_size); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9872,7 +9876,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_ino(root, &objectid); + ret = btrfs_find_free_objectid(root, &objectid); if (ret) goto out; @@ -9893,7 +9897,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) goto out; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); @@ -10272,6 +10276,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } #endif +/* + * Update the number of bytes used in the VFS' inode. When we replace extents in + * a range (clone, dedupe, fallocate's zero range), we must update the number of + * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls + * always get a correct value. 
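/*
 * Illustration (not part of the patch): a user-space analogue of
 * btrfs_update_inode_bytes(), whose definition follows - a pthread mutex
 * stands in for inode->lock. Subtracting and adding under one critical
 * section is what keeps a concurrent stat(2)-style reader from observing the
 * intermediate value. All names and sizes below are illustrative.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct demo_inode {
	pthread_mutex_t lock;
	uint64_t bytes;
};

static void update_inode_bytes(struct demo_inode *inode,
			       uint64_t add_bytes, uint64_t del_bytes)
{
	if (add_bytes == del_bytes)
		return;

	pthread_mutex_lock(&inode->lock);
	inode->bytes -= del_bytes;
	inode->bytes += add_bytes;
	pthread_mutex_unlock(&inode->lock);
}

int main(void)
{
	struct demo_inode inode = { PTHREAD_MUTEX_INITIALIZER, 8192 };

	/* Replace a 4 KiB extent with an 8 KiB one in a single atomic step. */
	update_inode_bytes(&inode, 8192, 4096);
	printf("bytes = %llu\n", (unsigned long long)inode.bytes);
	return 0;
}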
+ */ +void btrfs_update_inode_bytes(struct btrfs_inode *inode, + const u64 add_bytes, + const u64 del_bytes) +{ + if (add_bytes == del_bytes) + return; + + spin_lock(&inode->lock); + if (del_bytes > 0) + inode_sub_bytes(&inode->vfs_inode, del_bytes); + if (add_bytes > 0) + inode_add_bytes(&inode->vfs_inode, add_bytes); + spin_unlock(&inode->lock); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 69a384145dc6..703212ff50a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -34,7 +34,6 @@ #include "print-tree.h" #include "volumes.h" #include "locking.h" -#include "inode-map.h" #include "backref.h" #include "rcu-string.h" #include "send.h" @@ -193,6 +192,15 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } +static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, + unsigned int flags) +{ + if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) + return -EPERM; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); @@ -230,6 +238,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) goto out_unlock; + ret = check_fsflags_compatible(fs_info, fsflags); + if (ret) + goto out_unlock; + binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; @@ -336,7 +348,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -479,7 +491,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_end_transaction(trans); @@ -733,7 +745,7 @@ static noinline int create_subvol(struct inode *dir, } btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1275,6 +1287,7 @@ static int cluster_pages_for_defrag(struct inode *inode, u64 page_end; u64 page_cnt; u64 start = (u64)start_index << PAGE_SHIFT; + u64 search_start; int ret; int i; int i_done; @@ -1371,6 +1384,40 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); + + /* + * When defragmenting we skip ranges that have holes or inline extents + * (see should_defrag_range()), to avoid unnecessary IO and wasted space. + * At btrfs_defrag_file() we check if a range should be defragged before + * locking the inode and then, if it should, we trigger a sync page cache + * readahead - we lock the inode only after that, so that other tasks + * wanting to operate on other file ranges are not blocked for too long. + * But before we were able to get the inode lock, some other task may + * have punched a hole in the range, or we may now have an inline extent, + * in which case we should not defrag. So check for that here, where we + * have the inode and the range locked, and bail + * out if that happened. 
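/*
 * Illustration (not part of the patch): the hole/inline re-check added to
 * cluster_pages_for_defrag() below is a classic interval walk - look up the
 * mapping covering search_start, bail on a hole, otherwise continue from the
 * end of the returned extent. A toy version over a static array; the kernel
 * of course queries the extent map tree instead.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_em { uint64_t start, len; int is_hole; };

static const struct demo_em maps[] = {
	{ 0,     8192,  0 },
	{ 8192,  4096,  1 },	/* a punched hole */
	{ 12288, 16384, 0 },
};

static const struct demo_em *lookup(uint64_t off)
{
	for (unsigned i = 0; i < sizeof(maps) / sizeof(maps[0]); i++)
		if (off >= maps[i].start && off < maps[i].start + maps[i].len)
			return &maps[i];
	return NULL;
}

int main(void)
{
	uint64_t search_start = 0, range_end = 32768;

	while (search_start < range_end) {
		const struct demo_em *em = lookup(search_start);

		if (!em || em->is_hole) {
			printf("hole at %llu, skip defrag\n",
			       (unsigned long long)search_start);
			return 0;
		}
		search_start = em->start + em->len;	/* extent_map_end() analogue */
	}
	printf("range fully mapped, ok to defrag\n");
	return 0;
}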
+ */ + search_start = page_start; + while (search_start < page_end) { + struct extent_map *em; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, + page_end - search_start); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock_range; + } + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + /* Ok, 0 means we did not defrag anything */ + ret = 0; + goto out_unlock_range; + } + search_start = extent_map_end(em); + free_extent_map(em); + } + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); @@ -1401,6 +1448,10 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; + +out_unlock_range: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state); out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); @@ -1678,7 +1729,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -3321,7 +3372,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, rcu_read_lock(); dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL, true); + NULL); if (!dev) { ret = -ENODEV; @@ -3393,7 +3444,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = -ENOMEM; goto out_free; } - path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66e02ebdd340..5fafc5e89bb7 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -17,404 +17,89 @@ * Extent buffer locking * ===================== * - * The locks use a custom scheme that allows to do more operations than are - * available fromt current locking primitives. The building blocks are still - * rwlock and wait queues. - * - * Required semantics: + * We use a rw_semaphore for tree locking, and the semantics are exactly the + * same: * * - reader/writer exclusion * - writer/writer exclusion * - reader/reader sharing - * - spinning lock semantics - * - blocking lock semantics * - try-lock semantics for readers and writers - * - one level nesting, allowing read lock to be taken by the same thread that - * already has write lock - * - * The extent buffer locks (also called tree locks) manage access to eb data - * related to the storage in the b-tree (keys, items, but not the individual - * members of eb). - * We want concurrency of many readers and safe updates. The underlying locking - * is done by read-write spinlock and the blocking part is implemented using - * counters and wait queues. - * - * spinning semantics - the low-level rwlock is held so all other threads that - * want to take it are spinning on it. - * - * blocking semantics - the low-level rwlock is not held but the counter - * denotes how many times the blocking lock was held; - * sleeping is possible - * - * Write lock always allows only one thread to access the data. - * - * - * Debugging - * --------- - * - * There are additional state counters that are asserted in various contexts, - * removed from non-debug build to reduce extent_buffer size and for - * performance reasons. 
- * - * - * Lock recursion - * -------------- - * - * A write operation on a tree might indirectly start a look up on the same - * tree. This can happen when btrfs_cow_block locks the tree and needs to - * lookup free extents. - * - * btrfs_cow_block - * .. - * alloc_tree_block_no_bg_flush - * btrfs_alloc_tree_block - * btrfs_reserve_extent - * .. - * load_free_space_cache - * .. - * btrfs_lookup_file_extent - * btrfs_search_slot - * - * - * Locking pattern - spinning - * -------------------------- - * - * The simple locking scenario, the +--+ denotes the spinning section. - * - * +- btrfs_tree_lock - * | - extent_buffer::rwlock is held - * | - no heavy operations should happen, eg. IO, memory allocations, large - * | structure traversals - * +- btrfs_tree_unock -* -* - * Locking pattern - blocking - * -------------------------- - * - * The blocking write uses the following scheme. The +--+ denotes the spinning - * section. - * - * +- btrfs_tree_lock - * | - * +- btrfs_set_lock_blocking_write - * - * - allowed: IO, memory allocations, etc. - * - * -- btrfs_tree_unlock - note, no explicit unblocking necessary - * - * - * Blocking read is similar. - * - * +- btrfs_tree_read_lock - * | - * +- btrfs_set_lock_blocking_read - * - * - heavy operations allowed - * - * +- btrfs_tree_read_unlock_blocking - * | - * +- btrfs_tree_read_unlock - * - */ - -#ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); - eb->spinning_writers++; -} - -static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers != 1); - eb->spinning_writers--; -} - -static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); -} - -static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->spinning_readers); -} - -static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) -{ - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); -} - -static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) -{ - atomic_dec(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->read_locks)); -} - -static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) -{ - eb->write_locks++; -} - -static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) -{ - eb->write_locks--; -} - -#else -static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } -static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } -#endif - -/* - * Mark already held read lock as blocking. 
Can be nested in write lock by the - * same thread. - * - * Use when there are potentially long operations ahead so other thread waiting - * on the lock will not actively spin but sleep instead. - * - * The rwlock is released and blocking reader counter is increased. - */ -void btrfs_set_lock_blocking_read(struct extent_buffer *eb) -{ - trace_btrfs_set_lock_blocking_read(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) - return; - btrfs_assert_tree_read_locked(eb); - atomic_inc(&eb->blocking_readers); - btrfs_assert_spinning_readers_put(eb); - read_unlock(&eb->lock); -} - -/* - * Mark already held write lock as blocking. - * - * Use when there are potentially long operations ahead so other threads - * waiting on the lock will not actively spin but sleep instead. * - * The rwlock is released and blocking writers is set. + * The rwsem implementation does opportunistic spinning which reduces the number of + * times the locking task needs to sleep. */ -void btrfs_set_lock_blocking_write(struct extent_buffer *eb) -{ - trace_btrfs_set_lock_blocking_write(eb); - /* - * No lock is required. The lock owner may change if we have a read - * lock, but it won't change to or away from us. If we have the write - * lock, we are the owner and it'll never change. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) - return; - if (eb->blocking_writers == 0) { - btrfs_assert_spinning_writers_put(eb); - btrfs_assert_tree_locked(eb); - WRITE_ONCE(eb->blocking_writers, 1); - write_unlock(&eb->lock); - } -} /* - * Lock the extent buffer for read. Wait for any writers (spinning or blocking). - * Can be nested in write lock by the same thread. + * __btrfs_tree_read_lock - lock extent buffer for read + * @eb: the eb to be locked + * @nest: the nesting level to be used for lockdep * - * Use when the locked section does only lightweight actions and busy waiting - * would be cheaper than making other threads do the wait/wake loop. - * - * The rwlock is held upon exit. + * This takes the read lock on the extent buffer, using the specified nesting + * level for lockdep purposes. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, - bool recurse) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); -again: - read_lock(&eb->lock); - BUG_ON(eb->blocking_writers == 0 && - current->pid == eb->lock_owner); - if (eb->blocking_writers) { - if (current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. - * We allow an additional read lock to be added because - * it's for the same thread. btrfs_find_all_roots() - * depends on this as it may be called on a partly - * (write-)locked tree. 
- */ - WARN_ON(!recurse); - BUG_ON(eb->lock_recursed); - eb->lock_recursed = true; - read_unlock(&eb->lock); - trace_btrfs_tree_read_lock(eb, start_ns); - return; - } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - READ_ONCE(eb->blocking_writers) == 0); - goto again; - } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); + + down_read_nested(&eb->lock, nest); + eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } void btrfs_tree_read_lock(struct extent_buffer *eb) { - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); -} - -/* - * Lock extent buffer for read, optimistically expecting that there are no - * contending blocking writers. If there are, don't wait. - * - * Return 1 if the rwlock has been taken, 0 otherwise - */ -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) -{ - if (READ_ONCE(eb->blocking_writers)) - return 0; - - read_lock(&eb->lock); - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers)) { - read_unlock(&eb->lock); - return 0; - } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); - trace_btrfs_tree_read_lock_atomic(eb); - return 1; + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); } /* - * Try-lock for read. Don't block or wait for contending writers. + * Try-lock for read. * * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (READ_ONCE(eb->blocking_writers)) - return 0; - - if (!read_trylock(&eb->lock)) - return 0; - - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers)) { - read_unlock(&eb->lock); - return 0; + if (down_read_trylock(&eb->lock)) { + eb->lock_owner = current->pid; + trace_btrfs_try_tree_read_lock(eb); + return 1; } - btrfs_assert_tree_read_locks_get(eb); - btrfs_assert_spinning_readers_get(eb); - trace_btrfs_try_tree_read_lock(eb); - return 1; + return 0; } /* - * Try-lock for write. May block until the lock is uncontended, but does not - * wait until it is free. + * Try-lock for write. * * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) - return 0; - - write_lock(&eb->lock); - /* Refetch value after lock */ - if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { - write_unlock(&eb->lock); - return 0; + if (down_write_trylock(&eb->lock)) { + eb->lock_owner = current->pid; + trace_btrfs_try_tree_write_lock(eb); + return 1; } - btrfs_assert_tree_write_locks_get(eb); - btrfs_assert_spinning_writers_get(eb); - eb->lock_owner = current->pid; - trace_btrfs_try_tree_write_lock(eb); - return 1; + return 0; } /* - * Release read lock. Must be used only if the lock is in spinning mode. If - * the read lock is nested, must pair with read lock before the write unlock. - * - * The rwlock is not held upon exit. + * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - /* - * if we're nested, we have the write lock. No new locking - * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. 
- */ - if (eb->lock_recursed && current->pid == eb->lock_owner) { - eb->lock_recursed = false; - return; - } - btrfs_assert_tree_read_locked(eb); - btrfs_assert_spinning_readers_put(eb); - btrfs_assert_tree_read_locks_put(eb); - read_unlock(&eb->lock); -} - -/* - * Release read lock, previously set to blocking by a pairing call to - * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same - * thread. - * - * State of rwlock is unchanged, last reader wakes waiting threads. - */ -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) -{ - trace_btrfs_tree_read_unlock_blocking(eb); - /* - * if we're nested, we have the write lock. No new locking - * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_recursed - * field only matters to the lock owner. - */ - if (eb->lock_recursed && current->pid == eb->lock_owner) { - eb->lock_recursed = false; - return; - } - btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->blocking_readers) == 0); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_readers)) - cond_wake_up_nomb(&eb->read_lock_wq); - btrfs_assert_tree_read_locks_put(eb); + eb->lock_owner = 0; + up_read(&eb->lock); } /* - * Lock for write. Wait for all blocking and spinning readers and writers. This - * starts context where reader lock could be nested by the same thread. + * __btrfs_tree_lock - lock eb for write + * @eb: the eb to lock + * @nest: the nesting to use for the lock * - * The rwlock is held for write upon exit. + * Returns with the eb->lock write locked. */ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) @@ -424,19 +109,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) if (trace_btrfs_tree_lock_enabled()) start_ns = ktime_get_ns(); - WARN_ON(eb->lock_owner == current->pid); -again: - wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0); - write_lock(&eb->lock); - /* Refetch value after lock */ - if (atomic_read(&eb->blocking_readers) || - READ_ONCE(eb->blocking_writers)) { - write_unlock(&eb->lock); - goto again; - } - btrfs_assert_spinning_writers_get(eb); - btrfs_assert_tree_write_locks_get(eb); + down_write_nested(&eb->lock, nest); eb->lock_owner = current->pid; trace_btrfs_tree_lock(eb, start_ns); } @@ -447,68 +120,13 @@ void btrfs_tree_lock(struct extent_buffer *eb) } /* - * Release the write lock, either blocking or spinning (ie. there's no need - * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). - * This also ends the context for nesting, the read lock must have been - * released already. - * - * Tasks blocked and waiting are woken, rwlock is not held upon exit. + * Release the write lock. 
*/ void btrfs_tree_unlock(struct extent_buffer *eb) { - /* - * This is read both locked and unlocked but always by the same thread - * that already owns the lock so we don't need to use READ_ONCE - */ - int blockers = eb->blocking_writers; - - BUG_ON(blockers > 1); - - btrfs_assert_tree_locked(eb); trace_btrfs_tree_unlock(eb); eb->lock_owner = 0; - btrfs_assert_tree_write_locks_put(eb); - - if (blockers) { - btrfs_assert_no_spinning_writers(eb); - /* Unlocked write */ - WRITE_ONCE(eb->blocking_writers, 0); - /* - * We need to order modifying blocking_writers above with - * actually waking up the sleepers to ensure they see the - * updated value of blocking_writers - */ - cond_wake_up(&eb->write_lock_wq); - } else { - btrfs_assert_spinning_writers_put(eb); - write_unlock(&eb->lock); - } -} - -/* - * Set all locked nodes in the path to blocking locks. This should be done - * before scheduling - */ -void btrfs_set_path_blocking(struct btrfs_path *p) -{ - int i; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - /* - * If we currently have a spinning reader or writer lock this - * will bump the count of blocking holders and drop the - * spinlock. - */ - if (p->locks[i] == BTRFS_READ_LOCK) { - btrfs_set_lock_blocking_read(p->nodes[i]); - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - } else if (p->locks[i] == BTRFS_WRITE_LOCK) { - btrfs_set_lock_blocking_write(p->nodes[i]); - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } - } + up_write(&eb->lock); } /* @@ -564,14 +182,13 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, - bool recurse) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); + btrfs_tree_read_lock(eb); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 3ea81ed3320b..a2e1f1f5c6e3 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -13,8 +13,6 @@ #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 -#define BTRFS_WRITE_LOCK_BLOCKING 3 -#define BTRFS_READ_LOCK_BLOCKING 4 /* * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at @@ -89,42 +87,28 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, - bool recurse); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_read(struct extent_buffer *eb); -void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); -int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, - bool recurse); - -static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - return __btrfs_read_lock_root_node(root, 
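/*
 * Illustration (not part of the patch): after the rwsem conversion above,
 * try-lock is a single primitive instead of the old blocking_writers dance.
 * A user-space analogue with pthread_rwlock_t, keeping the
 * 1-on-success/0-on-failure convention of btrfs_try_tree_read_lock(); the
 * names are illustrative.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static int try_read_lock(void)
{
	return pthread_rwlock_tryrdlock(&lock) == 0;
}

int main(void)
{
	pthread_rwlock_wrlock(&lock);		/* a writer holds the lock */
	printf("try read while write-locked: %d\n", try_read_lock());
	pthread_rwlock_unlock(&lock);

	printf("try read when free: %d\n", try_read_lock());
	pthread_rwlock_unlock(&lock);		/* drop the read lock we won */
	return 0;
}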
false); -} +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { - BUG_ON(!eb->write_locks); + lockdep_assert_held(&eb->lock); } #else static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } #endif -void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { - if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) + if (rw == BTRFS_WRITE_LOCK) btrfs_tree_unlock(eb); - else if (rw == BTRFS_READ_LOCK_BLOCKING) - btrfs_tree_read_unlock_blocking(eb); else if (rw == BTRFS_READ_LOCK) btrfs_tree_read_unlock(eb); else diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 87bac9ecdf4c..79d366a36223 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -855,51 +855,6 @@ out: } /* - * search the ordered extents for one corresponding to 'offset' and - * try to find a checksum. This is used because we allow pages to - * be reclaimed before their checksum is actually put into the btree - */ -int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, - u64 disk_bytenr, u8 *sum, int len) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *ordered_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - unsigned long num_sectors; - unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(inode); - const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; - const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - int index = 0; - - ordered = btrfs_lookup_ordered_extent(inode, offset); - if (!ordered) - return 0; - - spin_lock_irq(&tree->lock); - list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr && - disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; - num_sectors = ordered_sum->len >> blocksize_bits; - num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i * csum_size, - num_sectors * csum_size); - - index += (int)num_sectors * csum_size; - if (index == len) - goto out; - disk_bytenr += num_sectors * sectorsize; - } - } -out: - spin_unlock_irq(&tree->lock); - btrfs_put_ordered_extent(ordered); - return index; -} - -/* * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. 
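/*
 * Illustration (not part of the patch): btrfs_ordered_sum_size(), in the
 * ordered-data.h hunk that follows, sizes one checksum record for a byte
 * range - sectors = DIV_ROUND_UP(bytes, sectorsize), one csum_size entry per
 * sector, plus the header. A stand-alone recomputation; the struct below is
 * a stub, not the kernel layout.
 */
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

struct demo_ordered_sum { uint64_t bytenr, len; /* csums follow */ };

int main(void)
{
	uint32_t sectorsize = 4096, csum_size = 4;	/* crc32c */
	unsigned long bytes = 10000;			/* rounds up to 3 sectors */

	int num_sectors = (int)DIV_ROUND_UP(bytes, sectorsize);
	size_t size = sizeof(struct demo_ordered_sum) + num_sectors * csum_size;

	printf("%d sectors, %zu bytes for the record\n", num_sectors, size);
	return 0;
}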
* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c3a2325e64a4..0bfa82b58e23 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -137,9 +137,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; + return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; } static inline void @@ -184,8 +183,6 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( u64 len); void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); -int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, - u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 7695c4783d33..fe5e0026129d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -177,8 +177,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, __le64 subvol_id; read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); - pr_info("\t\tsubvol_id %llu\n", - (unsigned long long)le64_to_cpu(subvol_id)); + pr_info("\t\tsubvol_id %llu\n", le64_to_cpu(subvol_id)); item_size -= sizeof(u64); offset += sizeof(u64); } @@ -191,15 +190,8 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, static void print_eb_refs_lock(struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG - btrfs_info(eb->fs_info, -"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", - atomic_read(&eb->refs), eb->write_locks, - atomic_read(&eb->read_locks), - eb->blocking_writers, - atomic_read(&eb->blocking_readers), - eb->spinning_writers, - atomic_read(&eb->spinning_readers), - eb->lock_owner, current->pid); + btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", + atomic_read(&eb->refs), eb->lock_owner, current->pid); #endif } @@ -398,6 +390,7 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) btrfs_node_key_to_cpu(c, &first_key, i); next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), + btrfs_header_owner(c), btrfs_node_ptr_generation(c, i), level - 1, &first_key); if (IS_ERR(next)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 87bd37b70738..fe3046007f52 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -894,8 +894,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->leave_spinning = 1; - key.objectid = 0; key.offset = 0; key.type = 0; @@ -1944,34 +1942,22 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, struct btrfs_key dst_key; if (src_path->nodes[cur_level] == NULL) { - struct btrfs_key first_key; struct extent_buffer *eb; int parent_slot; - u64 child_gen; - u64 child_bytenr; eb = src_path->nodes[cur_level + 1]; parent_slot = src_path->slots[cur_level + 1]; - child_bytenr = btrfs_node_blockptr(eb, parent_slot); - child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen, - cur_level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if 
(!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } src_path->nodes[cur_level] = eb; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + src_path->locks[cur_level] = BTRFS_READ_LOCK; } src_path->slots[cur_level] = dst_path->slots[cur_level]; @@ -2066,10 +2052,8 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, /* Read the tree block if needed */ if (dst_path->nodes[cur_level] == NULL) { - struct btrfs_key first_key; int parent_slot; u64 child_gen; - u64 child_bytenr; /* * dst_path->nodes[root_level] must be initialized before @@ -2088,31 +2072,23 @@ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, */ eb = dst_path->nodes[cur_level + 1]; parent_slot = dst_path->slots[cur_level + 1]; - child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); /* This node is old, no need to trace */ if (child_gen < last_snapshot) goto out; - eb = read_tree_block(fs_info, child_bytenr, child_gen, - cur_level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } dst_path->nodes[cur_level] = eb; dst_path->slots[cur_level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING; + dst_path->locks[cur_level] = BTRFS_READ_LOCK; need_cleanup = true; } @@ -2256,38 +2232,28 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { - struct btrfs_key first_key; int parent_slot; - u64 child_gen; u64 child_bytenr; /* - * We need to get child blockptr/gen from parent before - * we can read it. + * We need to get child blockptr from parent before we + * can read it. 
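/*
 * Illustration (not part of the patch): the qgroup paths above replace
 * open-coded read_tree_block() + uptodate checks with btrfs_read_node_slot(),
 * which folds the validation in and reports failure through the usual
 * error-pointer scheme. A minimal user-space rendition of that ERR_PTR
 * pattern; the macros mirror the kernel's but are simplified.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define ERR_PTR(err)  ((void *)(intptr_t)(err))
#define PTR_ERR(ptr)  ((long)(intptr_t)(ptr))
#define IS_ERR(ptr)   ((uintptr_t)(ptr) >= (uintptr_t)-4095)

struct node { int level; };

static struct node *read_node_slot(int slot)
{
	static struct node good = { .level = 1 };

	if (slot < 0)
		return ERR_PTR(-EIO);	/* lookup and validation in one place */
	return &good;
}

int main(void)
{
	struct node *eb = read_node_slot(-1);

	if (IS_ERR(eb))
		printf("read failed: %ld\n", PTR_ERR(eb));

	eb = read_node_slot(0);
	if (!IS_ERR(eb))
		printf("got node at level %d\n", eb->level);
	return 0;
}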
*/ eb = path->nodes[level + 1]; parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); - child_gen = btrfs_node_ptr_generation(eb, parent_slot); - btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen, - level, &first_key); + eb = btrfs_read_node_slot(eb, parent_slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; } path->nodes[level] = eb; path->slots[level] = 0; btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize, @@ -4242,7 +4208,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, spin_unlock(&blocks->lock); /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, block->reloc_generation, block->level, &block->first_key); if (IS_ERR(reloc_eb)) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 255490f42b5d..93fbf87bdc8d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1097,7 +1097,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << 9; last_end += last->bi_iter.bi_size; /* @@ -1163,7 +1163,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) struct bvec_iter iter; int i = 0; - start = (u64)bio->bi_iter.bi_sector << 9; + start = bio->bi_iter.bi_sector << 9; stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; @@ -1374,7 +1374,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; int i; for (i = 0; i < rbio->nr_data; i++) { @@ -2150,7 +2150,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, if (rbio->faila == -1) { btrfs_warn(fs_info, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", - __func__, (u64)bio->bi_iter.bi_sector << 9, + __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bbio->map_type); if (generic_io) btrfs_put_bbio(bbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index d9a166eb344e..20fd4aa48a8c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -52,6 +52,7 @@ struct reada_extctl { struct reada_extent { u64 logical; + u64 owner_root; struct btrfs_key top; struct list_head extctl; int refcnt; @@ -59,6 +60,7 @@ struct reada_extent { struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; int scheduled; + int level; }; struct reada_zone { @@ -87,7 +89,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info); static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 generation); + struct btrfs_key *top, u64 owner_root, + u64 generation, int level); /* recurses */ /* in case of err, eb might be NULL */ @@ -165,7 +168,9 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, if (rec->generation == generation && 
btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, n_gen); + reada_add_block(rc, bytenr, &next_key, + btrfs_header_owner(eb), n_gen, + btrfs_header_level(eb) - 1); } } @@ -298,7 +303,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_key *top) + struct btrfs_key *top, + u64 owner_root, int level) { int ret; struct reada_extent *re = NULL; @@ -331,6 +337,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); re->refcnt = 1; + re->owner_root = owner_root; + re->level = level; /* * map block @@ -531,6 +539,8 @@ static void reada_zone_release(struct kref *kref) { struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + lockdep_assert_held(&zone->device->fs_info->reada_lock); + radix_tree_delete(&zone->device->reada_zones, zone->end >> PAGE_SHIFT); @@ -546,14 +556,15 @@ static void reada_control_release(struct kref *kref) } static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, u64 generation) + struct btrfs_key *top, u64 owner_root, + u64 generation, int level) { struct btrfs_fs_info *fs_info = rc->fs_info; struct reada_extent *re; struct reada_extctl *rec; /* takes one ref */ - re = reada_find_extent(fs_info, logical, top); + re = reada_find_extent(fs_info, logical, top, owner_root, level); if (!re) return -1; @@ -645,12 +656,13 @@ static int reada_pick_zone(struct btrfs_device *dev) } static int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, - int mirror_num, struct extent_buffer **eb) + u64 owner_root, int level, int mirror_num, + struct extent_buffer **eb) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); if (IS_ERR(buf)) return 0; @@ -738,7 +750,8 @@ static int reada_start_machine_dev(struct btrfs_device *dev) logical = re->logical; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info, logical, re->owner_root, + re->level, mirror_num, &eb); if (ret) __readahead_hook(fs_info, re, NULL, ret); else if (eb) @@ -945,6 +958,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int ret; + int level; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -967,9 +981,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, node = btrfs_root_node(root); start = node->start; generation = btrfs_header_generation(node); + level = btrfs_header_level(node); free_extent_buffer(node); - ret = reada_add_block(rc, start, &max_key, generation); + ret = reada_add_block(rc, start, &max_key, root->root_key.objectid, + generation, level); if (ret) { kfree(rc); return ERR_PTR(ret); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 78693d3dd15b..4b9b6c52a83b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -551,34 +551,19 @@ static int process_leaf(struct btrfs_root *root, static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int level, u64 *bytenr, u64 *num_bytes) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *eb; - u64 block_bytenr, gen; int ret = 0; while 
(level >= 0) { if (level) { - struct btrfs_key first_key; - - block_bytenr = btrfs_node_blockptr(path->nodes[level], - path->slots[level]); - gen = btrfs_node_ptr_generation(path->nodes[level], - path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, - path->slots[level]); - eb = read_tree_block(fs_info, block_bytenr, gen, - level - 1, &first_key); + eb = btrfs_read_node_slot(path->nodes[level], + path->slots[level]); if (IS_ERR(eb)) return PTR_ERR(eb); - if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - return -EIO; - } btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_read(eb); path->nodes[level-1] = eb; path->slots[level-1] = 0; - path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level-1] = BTRFS_READ_LOCK; } else { ret = process_leaf(root, path, bytenr, num_bytes); if (ret) @@ -799,8 +784,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (!be) { btrfs_err(fs_info, "trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", - action, (unsigned long long)bytenr, - (unsigned long long)num_bytes); + action, bytenr, num_bytes); dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); @@ -1001,11 +985,10 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) return -ENOMEM; eb = btrfs_read_lock_root_node(fs_info->extent_root); - btrfs_set_lock_blocking_read(eb); level = btrfs_header_level(eb); path->nodes[level] = eb; path->slots[level] = 0; - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + path->locks[level] = BTRFS_READ_LOCK; while (1) { /* diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 99aa87c08912..ab80896315be 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -31,10 +31,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, endoff = destoff + olen; if (endoff > inode->i_size) { i_size_write(inode, endoff); - btrfs_inode_safe_disk_i_size_write(inode, 0); + btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -163,6 +163,7 @@ static int clone_copy_inline_extent(struct inode *dst, const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; + struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; @@ -252,7 +253,11 @@ copy_inline_extent: trans = NULL; goto out; } - ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + drop_args.path = path; + drop_args.start = drop_start; + drop_args.end = aligned_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -263,7 +268,7 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - inode_add_bytes(dst, datal); + btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: @@ -347,7 +352,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 drop_start; /* Note the key will change type as we walk through the tree */ - path->leave_spinning = 1; ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) @@ -417,7 +421,6 @@ process_slot: size); btrfs_release_path(path); - 
path->leave_spinning = 0; memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(BTRFS_I(inode)); @@ -533,7 +536,6 @@ process_slot: * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); - path->leave_spinning = 0; ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); @@ -652,7 +654,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (destoff > inode->i_size) { const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); - ret = btrfs_cont_expand(inode, inode->i_size, destoff); + ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); if (ret) return ret; /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9ba92d86da0b..19b7db8b2117 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -18,7 +18,6 @@ #include "btrfs_inode.h" #include "async-thread.h" #include "free-space-cache.h" -#include "inode-map.h" #include "qgroup.h" #include "print-tree.h" #include "delalloc-space.h" @@ -783,7 +782,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + btrfs_set_root_drop_level(root_item, 0); } btrfs_tree_unlock(eb); @@ -1196,7 +1195,6 @@ again: btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking_write(eb); level = btrfs_header_level(eb); if (level < lowest_level) { @@ -1210,7 +1208,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); if (next_key) { next_key->objectid = (u64)-1; @@ -1220,8 +1217,6 @@ again: parent = eb; while (1) { - struct btrfs_key first_key; - level = btrfs_header_level(parent); BUG_ON(level < lowest_level); @@ -1237,7 +1232,6 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; @@ -1262,15 +1256,10 @@ again: break; } - eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, - level - 1, &first_key); + eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; - } else if (!extent_buffer_uptodate(eb)) { - ret = -EIO; - free_extent_buffer(eb); - break; } btrfs_tree_lock(eb); if (cow) { @@ -1279,7 +1268,6 @@ again: BTRFS_NESTING_COW); BUG_ON(ret); } - btrfs_set_lock_blocking_write(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1418,10 +1406,8 @@ static noinline_for_stack int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, int *level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *eb = NULL; int i; - u64 bytenr; u64 ptr_gen = 0; u64 last_snapshot; u32 nritems; @@ -1429,8 +1415,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = *level; i > 0; i--) { - struct btrfs_key first_key; - eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] < nritems) { @@ -1450,16 +1434,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, return 0; } - bytenr = btrfs_node_blockptr(eb, path->slots[i]); - btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); - eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, - &first_key); - if (IS_ERR(eb)) { + eb = 
btrfs_read_node_slot(eb, path->slots[i]); + if (IS_ERR(eb)) return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - return -EIO; - } BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; @@ -1575,7 +1552,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, reloc_root_item = &reloc_root->root_item; memset(&reloc_root_item->drop_progress, 0, sizeof(reloc_root_item->drop_progress)); - reloc_root_item->drop_level = 0; + btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); btrfs_update_reloc_root(trans, root); @@ -1652,8 +1629,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, int level; int max_level; int replaced = 0; - int ret; - int err = 0; + int ret = 0; u32 min_reserved; path = btrfs_alloc_path(); @@ -1672,7 +1648,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, } else { btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - level = root_item->drop_level; + level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); @@ -1704,13 +1680,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, BTRFS_RESERVE_FLUSH_LIMIT); - if (ret) { - err = ret; + if (ret) goto out; - } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto out; } @@ -1732,10 +1706,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (ret > 0) break; @@ -1746,11 +1718,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, ret = replace_path(trans, rc, root, reloc_root, path, &next_key, level, max_level); } - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } - if (ret > 0) { level = ret; btrfs_node_key_to_cpu(path->nodes[level], &key, @@ -1769,7 +1738,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, */ btrfs_node_key(path->nodes[level], &root_item->drop_progress, path->slots[level]); - root_item->drop_level = level; + btrfs_set_root_drop_level(root_item, level); btrfs_end_transaction_throttle(trans); trans = NULL; @@ -1789,12 +1758,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); - if (ret < 0) - err = ret; out: btrfs_free_path(path); - if (err == 0) + if (ret == 0) insert_dirty_subvol(trans, rc, root); if (trans) @@ -1805,7 +1772,7 @@ out: if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); - return err; + return ret; } static noinline_for_stack @@ -2205,7 +2172,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_path *path, int lowest) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *upper; struct btrfs_backref_edge *edge; struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; @@ -2213,17 +2179,14 @@ static int do_relocation(struct btrfs_trans_handle *trans, struct extent_buffer *eb; u32 blocksize; u64 bytenr; - u64 generation; int slot; - int ret; - int err = 0; + int ret = 0; BUG_ON(lowest && node->eb); path->lowest_level = node->level + 
1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; cond_resched(); @@ -2235,10 +2198,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { if (!lowest) { ret = btrfs_bin_search(upper->eb, key, &slot); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(ret); bytenr = btrfs_node_blockptr(upper->eb, slot); if (node->eb->start == bytenr) @@ -2250,10 +2211,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret) { - if (ret < 0) - err = ret; - else - err = -ENOENT; + if (ret > 0) + ret = -ENOENT; btrfs_release_path(path); break; @@ -2273,10 +2232,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_bin_search(upper->eb, key, &slot); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(ret); } @@ -2287,7 +2244,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", bytenr, node->bytenr, slot, upper->eb->start); - err = -EIO; + ret = -EIO; goto next; } } else { @@ -2296,30 +2253,20 @@ static int do_relocation(struct btrfs_trans_handle *trans, } blocksize = root->fs_info->nodesize; - generation = btrfs_node_ptr_generation(upper->eb, slot); - btrfs_node_key_to_cpu(upper->eb, &first_key, slot); - eb = read_tree_block(fs_info, bytenr, generation, - upper->level - 1, &first_key); + eb = btrfs_read_node_slot(upper->eb, slot); if (IS_ERR(eb)) { - err = PTR_ERR(eb); - goto next; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - err = -EIO; + ret = PTR_ERR(eb); goto next; } btrfs_tree_lock(eb); - btrfs_set_lock_blocking_write(eb); if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); - if (ret < 0) { - err = ret; + if (ret < 0) goto next; - } BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, @@ -2345,19 +2292,19 @@ next: btrfs_backref_drop_node_buffer(upper); else btrfs_backref_unlock_node_buffer(upper); - if (err) + if (ret) break; } - if (!err && node->pending) { + if (!ret && node->pending) { btrfs_backref_drop_node_buffer(node); list_move_tail(&node->list, &rc->backref_cache.changed); node->pending = 0; } path->lowest_level = 0; - BUG_ON(err == -ENOSPC); - return err; + BUG_ON(ret == -ENOSPC); + return ret; } static int link_to_upper(struct btrfs_trans_handle *trans, @@ -2446,7 +2393,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->key.offset, + eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); @@ -2546,7 +2493,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - readahead_tree_block(fs_info, block->bytenr); + btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + block->level); } /* Get first keys */ @@ -3071,7 +3019,7 @@ int add_data_references(struct reloc_control *rc, while ((ref_node = ulist_next(leaves, &leaf_uiter))) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); + eb = 
read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; } @@ -3694,7 +3642,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); - root->root_item.drop_level = 0; + btrfs_set_root_drop_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 0); ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e71e7586e9eb..5f4f88a4d2c8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -20,6 +20,7 @@ #include "rcu-string.h" #include "raid56.h" #include "block-group.h" +#include "zoned.h" /* * This is only the first step towards a full-featured scrub. It reads all @@ -71,11 +72,9 @@ struct scrub_page { u64 physical; u64 physical_for_dev_replace; atomic_t refs; - struct { - unsigned int mirror_num:8; - unsigned int have_csum:1; - unsigned int io_error:1; - }; + u8 mirror_num; + int have_csum:1; + int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; @@ -131,7 +130,7 @@ struct scrub_parity { int nsectors; - u64 stripe_len; + u32 stripe_len; refcount_t refs; @@ -161,7 +160,6 @@ struct scrub_ctx { atomic_t workers_pending; spinlock_t list_lock; wait_queue_head_t list_wait; - u16 csum_size; struct list_head csum_list; atomic_t cancel_req; int readonly; @@ -235,15 +233,15 @@ static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force, + u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u64 extent_len, + u64 extent_logical, u32 extent_len, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num); @@ -256,10 +254,10 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *page) +static inline int scrub_is_page_on_raid56(struct scrub_page *spage) { - return page->recover && - (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return spage->recover && + (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -610,7 +608,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); - sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); @@ -1092,11 +1089,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) success = 1; for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = 
sblock_bad->pagev[page_num]; struct scrub_block *sblock_other = NULL; /* skip no-io-error page in scrub */ - if (!page_bad->io_error && !sctx->is_dev_replace) + if (!spage_bad->io_error && !sctx->is_dev_replace) continue; if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { @@ -1108,7 +1105,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (page_bad->io_error) { + } else if (spage_bad->io_error) { /* try to find no-io-error page in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && @@ -1147,7 +1144,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other, page_num, 0); if (0 == ret) - page_bad->io_error = 0; + spage_bad->io_error = 0; else success = 0; } @@ -1325,13 +1322,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *page; + struct scrub_page *spage; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; - page = kzalloc(sizeof(*page), GFP_NOFS); - if (!page) { + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1339,17 +1336,17 @@ leave_nomem: scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(page); - sblock->pagev[page_index] = page; - page->sblock = sblock; - page->flags = flags; - page->generation = generation; - page->logical = logical; - page->have_csum = have_csum; + scrub_page_get(spage); + sblock->pagev[page_index] = spage; + spage->sblock = sblock; + spage->flags = flags; + spage->generation = generation; + spage->logical = logical; + spage->have_csum = have_csum; if (have_csum) - memcpy(page->csum, + memcpy(spage->csum, original_sblock->pagev[0]->csum, - sctx->csum_size); + sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, bbio->map_type, @@ -1360,23 +1357,23 @@ leave_nomem: mirror_index, &stripe_index, &stripe_offset); - page->physical = bbio->stripes[stripe_index].physical + + spage->physical = bbio->stripes[stripe_index].physical + stripe_offset; - page->dev = bbio->stripes[stripe_index].dev; + spage->dev = bbio->stripes[stripe_index].dev; BUG_ON(page_index >= original_sblock->page_count); - page->physical_for_dev_replace = + spage->physical_for_dev_replace = original_sblock->pagev[page_index]-> physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ - page->mirror_num = mirror_index + 1; + spage->mirror_num = mirror_index + 1; sblock->page_count++; - page->page = alloc_page(GFP_NOFS); - if (!page->page) + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) goto leave_nomem; scrub_get_recover(recover); - page->recover = recover; + spage->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; @@ -1394,19 +1391,19 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *page) + struct scrub_page *spage) { DECLARE_COMPLETION_ONSTACK(done); int ret; int mirror_num; - bio->bi_iter.bi_sector = page->logical >> 9; + bio->bi_iter.bi_sector = spage->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - mirror_num = page->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, - page->recover->map_length, + mirror_num = 
spage->sblock->pagev[0]->mirror_num; + ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio, + spage->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -1431,10 +1428,10 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *page = sblock->pagev[page_num]; + struct scrub_page *spage = sblock->pagev[page_num]; - WARN_ON(!page->page); - bio_add_page(bio, page->page, PAGE_SIZE, 0); + WARN_ON(!spage->page); + bio_add_page(bio, spage->page, PAGE_SIZE, 0); } if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { @@ -1475,24 +1472,24 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; - struct scrub_page *page = sblock->pagev[page_num]; + struct scrub_page *spage = sblock->pagev[page_num]; - if (page->dev->bdev == NULL) { - page->io_error = 1; + if (spage->dev->bdev == NULL) { + spage->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!page->page); + WARN_ON(!spage->page); bio = btrfs_io_bio_alloc(1); - bio_set_dev(bio, page->dev->bdev); + bio_set_dev(bio, spage->dev->bdev); - bio_add_page(bio, page->page, PAGE_SIZE, 0); - bio->bi_iter.bi_sector = page->physical >> 9; + bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio->bi_iter.bi_sector = spage->physical >> 9; bio->bi_opf = REQ_OP_READ; if (btrfsic_submit_bio_wait(bio)) { - page->io_error = 1; + spage->io_error = 1; sblock->no_io_error_seen = 0; } @@ -1548,36 +1545,36 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write) { - struct scrub_page *page_bad = sblock_bad->pagev[page_num]; - struct scrub_page *page_good = sblock_good->pagev[page_num]; + struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + struct scrub_page *spage_good = sblock_good->pagev[page_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; - BUG_ON(page_bad->page == NULL); - BUG_ON(page_good->page == NULL); + BUG_ON(spage_bad->page == NULL); + BUG_ON(spage_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || page_bad->io_error) { + sblock_bad->checksum_error || spage_bad->io_error) { struct bio *bio; int ret; - if (!page_bad->dev->bdev) { + if (!spage_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(1); - bio_set_dev(bio, page_bad->dev->bdev); - bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_dev(bio, spage_bad->dev->bdev); + bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; - ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); + ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { bio_put(bio); return -EIO; } if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(page_bad->dev, + btrfs_dev_stat_inc_and_print(spage_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); bio_put(bio); @@ -1798,11 +1795,15 @@ static int scrub_checksum_data(struct scrub_block *sblock) shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); - if (memcmp(csum, spage->csum, sctx->csum_size)) - sblock->checksum_error = 1; + /* + * In scrub_pages() and scrub_pages_for_parity() we ensure each spage 
+ * only contains one sector of data. + */ + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + if (memcmp(csum, spage->csum, fs_info->csum_size)) + sblock->checksum_error = 1; return sblock->checksum_error; } @@ -1814,16 +1815,26 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; - const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT; + /* + * This is done in sectorsize steps even for metadata as there's a + * constraint for nodesize to be aligned to sectorsize. This will need + * to change so we don't misuse data and metadata units like that. + */ + const u32 sectorsize = sctx->fs_info->sectorsize; + const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; struct scrub_page *spage; char *kaddr; BUG_ON(sblock->page_count < 1); + + /* Each member in pagev is just one block, not a full page */ + ASSERT(sblock->page_count == num_sectors); + spage = sblock->pagev[0]; kaddr = page_address(spage->page); h = (struct btrfs_header *)kaddr; - memcpy(on_disk_csum, h->csum, sctx->csum_size); + memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); /* * we don't use the getter functions here, as we @@ -1848,15 +1859,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - PAGE_SIZE - BTRFS_CSUM_SIZE); + sectorsize - BTRFS_CSUM_SIZE); - for (i = 1; i < num_pages; i++) { + for (i = 1; i < num_sectors; i++) { kaddr = page_address(sblock->pagev[i]->page); - crypto_shash_update(shash, kaddr, PAGE_SIZE); + crypto_shash_update(shash, kaddr, sectorsize); } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) sblock->checksum_error = 1; return sblock->header_error || sblock->checksum_error; @@ -1893,7 +1904,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); - if (memcmp(calculated_csum, s->csum, sctx->csum_size)) + if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) ++fail_cor; if (fail_cor + fail_gen) { @@ -2150,12 +2161,13 @@ bbio_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, int force, + u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) { struct scrub_block *sblock; + const u32 sectorsize = sctx->fs_info->sectorsize; int index; sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); @@ -2174,7 +2186,12 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u64 l = min_t(u64, len, PAGE_SIZE); + /* + * Here we will allocate one page for one sector to scrub. + * This is fine if PAGE_SIZE == sectorsize, but will cost + * more memory for PAGE_SIZE > sectorsize case. 
+ */ + u32 l = min(sectorsize, len); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2198,7 +2215,7 @@ leave_nomem: spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->csum_size); + memcpy(spage->csum, csum, sctx->fs_info->csum_size); } else { spage->have_csum = 0; } @@ -2231,7 +2248,7 @@ leave_nomem: } } - if (force) + if (flags & BTRFS_EXTENT_FLAG_SUPER) scrub_submit(sctx); } @@ -2295,12 +2312,11 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, - u64 start, u64 len) + u64 start, u32 len) { u64 offset; - u64 nsectors64; u32 nsectors; - int sectorsize = sparity->sctx->fs_info->sectorsize; + u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; if (len >= sparity->stripe_len) { bitmap_set(bitmap, 0, sparity->nsectors); @@ -2309,11 +2325,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, start -= sparity->logic_start; start = div64_u64_rem(start, sparity->stripe_len, &offset); - offset = div_u64(offset, sectorsize); - nsectors64 = div_u64(len, sectorsize); - - ASSERT(nsectors64 < UINT_MAX); - nsectors = (u32)nsectors64; + offset = offset >> sectorsize_bits; + nsectors = len >> sectorsize_bits; if (offset + nsectors <= sparity->nsectors) { bitmap_set(bitmap, offset, nsectors); @@ -2325,13 +2338,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, - u64 start, u64 len) + u64 start, u32 len) { __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, - u64 start, u64 len) + u64 start, u32 len) { __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); } @@ -2359,48 +2372,77 @@ static void scrub_block_complete(struct scrub_block *sblock) u64 end = sblock->pagev[sblock->page_count - 1]->logical + PAGE_SIZE; + ASSERT(end - start <= U32_MAX); scrub_parity_mark_sectors_error(sblock->sparity, start, end - start); } } +static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) +{ + sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; + list_del(&sum->list); + kfree(sum); +} + +/* + * Find the desired csum for range [logical, logical + sectorsize), and store + * the csum into @csum. + * + * The search source is sctx->csum_list, which is a pre-populated list + * storing bytenr ordered csum ranges. We're responsible for cleaning up any + * range that is before @logical. + * + * Return 0 if there is no csum for the range. + * Return 1 if there is csum for the range and copied to @csum. 
+ */ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) { - struct btrfs_ordered_sum *sum = NULL; - unsigned long index; - unsigned long num_sectors; + bool found = false; while (!list_empty(&sctx->csum_list)) { + struct btrfs_ordered_sum *sum = NULL; + unsigned long index; + unsigned long num_sectors; + sum = list_first_entry(&sctx->csum_list, struct btrfs_ordered_sum, list); + /* The current csum range is beyond our range, no csum found */ if (sum->bytenr > logical) - return 0; - if (sum->bytenr + sum->len > logical) break; - ++sctx->stat.csum_discards; - list_del(&sum->list); - kfree(sum); - sum = NULL; - } - if (!sum) - return 0; + /* + * The current sum is before our bytenr, since scrub is always + * done in bytenr order, the csum will never be used anymore, + * clean it up so that later calls won't bother with the range, + * and continue the search with the next range. + */ + if (sum->bytenr + sum->len <= logical) { + drop_csum_range(sctx, sum); + continue; + } - index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize); - ASSERT(index < UINT_MAX); + /* Now the csum range covers our bytenr, copy the csum */ + found = true; + index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; + num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - num_sectors = sum->len / sctx->fs_info->sectorsize; - memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); - if (index == num_sectors - 1) { - list_del(&sum->list); - kfree(sum); + memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, + sctx->fs_info->csum_size); + + /* Cleanup the range if we're at the end of the csum range */ + if (index == num_sectors - 1) + drop_csum_range(sctx, sum); + break; } + if (!found) + return 0; return 1; } /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u64 physical_for_dev_replace) { @@ -2432,7 +2474,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, } while (len) { - u64 l = min_t(u64, len, blocksize); + u32 l = min(len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -2442,7 +2484,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, ++sctx->stat.no_csum; } ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? 
csum : NULL, physical_for_dev_replace); if (ret) return ret; @@ -2455,14 +2497,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, } static int scrub_pages_for_parity(struct scrub_parity *sparity, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) { struct scrub_ctx *sctx = sparity->sctx; struct scrub_block *sblock; + const u32 sectorsize = sctx->fs_info->sectorsize; int index; + ASSERT(IS_ALIGNED(len, sectorsize)); + sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); if (!sblock) { spin_lock(&sctx->stat_lock); @@ -2481,7 +2526,6 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_page *spage; - u64 l = min_t(u64, len, PAGE_SIZE); spage = kzalloc(sizeof(*spage), GFP_KERNEL); if (!spage) { @@ -2508,7 +2552,7 @@ leave_nomem: spage->mirror_num = mirror_num; if (csum) { spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->csum_size); + memcpy(spage->csum, csum, sctx->fs_info->csum_size); } else { spage->have_csum = 0; } @@ -2516,9 +2560,12 @@ leave_nomem: spage->page = alloc_page(GFP_KERNEL); if (!spage->page) goto leave_nomem; - len -= l; - logical += l; - physical += l; + + + /* Iterate over the stripe range in sectorsize steps */ + len -= sectorsize; + logical += sectorsize; + physical += sectorsize; } WARN_ON(sblock->page_count == 0); @@ -2539,7 +2586,7 @@ leave_nomem: } static int scrub_extent_for_parity(struct scrub_parity *sparity, - u64 logical, u64 len, + u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num) { @@ -2563,7 +2610,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } while (len) { - u64 l = min_t(u64, len, blocksize); + u32 l = min(len, blocksize); int have_csum = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) { @@ -2767,7 +2814,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 generation; u64 extent_logical; u64 extent_physical; - u64 extent_len; + /* Check the comment in scrub_stripe() for why u32 is enough here */ + u32 extent_len; u64 mapped_length; struct btrfs_device *extent_dev; struct scrub_parity *sparity; @@ -2776,7 +2824,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = div_u64(map->stripe_len, fs_info->sectorsize); + ASSERT(map->stripe_len <= U32_MAX); + nsectors = map->stripe_len >> fs_info->sectorsize_bits; bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -2787,6 +2836,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, return -ENOMEM; } + ASSERT(map->stripe_len <= U32_MAX); sparity->stripe_len = map->stripe_len; sparity->nsectors = nsectors; sparity->sctx = sctx; @@ -2881,6 +2931,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, } again: extent_logical = key.objectid; + ASSERT(bytes <= U32_MAX); extent_len = bytes; if (extent_logical < logic_start) { @@ -2959,9 +3010,11 @@ next: logic_start += map->stripe_len; } out: - if (ret < 0) + if (ret < 0) { + ASSERT(logic_end - logic_start <= U32_MAX); scrub_parity_mark_sectors_error(sparity, logic_start, logic_end - logic_start); + } scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3003,7 +3056,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 offset; u64 extent_logical; u64 
extent_physical; - u64 extent_len; + /* + * Unlike chunk length, extent length should never go beyond + * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. + */ + u32 extent_len; u64 stripe_logical; u64 stripe_end; struct btrfs_device *extent_dev; @@ -3084,17 +3141,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key, &key_end); - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); + if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = logic_end; + reada2 = btrfs_reada_add(csum_root, &key, &key_end); + } else { + reada2 = NULL; + } if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); - if (!IS_ERR(reada2)) + if (!IS_ERR_OR_NULL(reada2)) btrfs_reada_wait(reada2); @@ -3248,6 +3309,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, again: extent_logical = key.objectid; + ASSERT(bytes <= U32_MAX); extent_len = bytes; /* @@ -3704,10 +3766,12 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; + if (!btrfs_check_super_location(scrub_dev, bytenr)) + continue; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, 1, bytenr); + NULL, bytenr); if (ret) return ret; } @@ -3821,14 +3885,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - if (fs_info->sectorsize != PAGE_SIZE) { - /* not supported for data w/o checksums */ - btrfs_err_rl(fs_info, - "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails", - fs_info->sectorsize, PAGE_SIZE); - return -EINVAL; - } - if (fs_info->nodesize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { @@ -3855,7 +3911,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4032,7 +4088,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (dev) sctx = dev->scrub_ctx; if (sctx) @@ -4043,7 +4099,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, } static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u64 extent_len, + u64 extent_logical, u32 extent_len, u64 *extent_physical, struct btrfs_device **extent_dev, int *extent_mirror_num) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 340c76a12ce1..d719a2755a40 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2410,7 +2410,7 @@ static int send_subvol_begin(struct send_ctx *sctx) 
sctx->send_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, - le64_to_cpu(sctx->send_root->root_item.ctransid)); + btrfs_root_ctransid(&sctx->send_root->root_item)); if (parent_root) { if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, @@ -2419,7 +2419,7 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - le64_to_cpu(sctx->parent_root->root_item.ctransid)); + btrfs_root_ctransid(&sctx->parent_root->root_item)); } ret = send_cmd(sctx); @@ -5101,7 +5101,7 @@ static int send_clone(struct send_ctx *sctx, TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, - le64_to_cpu(clone_root->root->root_item.ctransid)); + btrfs_root_ctransid(&clone_root->root->root_item)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, clone_root->offset); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index c46be27be700..8260f8bb3ff0 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -57,8 +57,9 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ - const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = PAGE_SIZE - oip; \ @@ -85,8 +86,8 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = offset_in_page(member_offset); \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ char *kaddr = page_address(eb->pages[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ @@ -106,8 +107,9 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ - const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ + const unsigned long oip = get_eb_offset_in_page(token->eb, \ + member_offset); \ const int size = sizeof(u##bits); \ u8 lebytes[sizeof(u##bits)]; \ const int part = PAGE_SIZE - oip; \ @@ -136,8 +138,8 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ const unsigned long member_offset = (unsigned long)ptr + off; \ - const unsigned long oip = offset_in_page(member_offset); \ - const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \ + const unsigned long idx = get_eb_page_index(member_offset); \ char *kaddr = page_address(eb->pages[idx]); \ const int size = sizeof(u##bits); \ const int part = PAGE_SIZE - oip; \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8840a4fa81eb..022f20810089 100644 --- a/fs/btrfs/super.c +++ 
b/fs/btrfs/super.c @@ -44,6 +44,7 @@ #include "backref.h" #include "space-info.h" #include "sysfs.h" +#include "zoned.h" #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" @@ -240,9 +241,13 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . vaf.fmt = fmt; vaf.va = &args; - if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, - fs_info ? fs_info->sb->s_id : "<unknown>", &vaf); + if (__ratelimit(ratelimit)) { + if (fs_info) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info->sb->s_id, &vaf); + else + printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } va_end(args); } @@ -333,7 +338,6 @@ enum { Opt_device, Opt_fatal_errors, Opt_flushoncommit, Opt_noflushoncommit, - Opt_inode_cache, Opt_noinode_cache, Opt_max_inline, Opt_barrier, Opt_nobarrier, Opt_datacow, Opt_nodatacow, @@ -360,9 +364,13 @@ enum { Opt_rescue, Opt_usebackuproot, Opt_nologreplay, + Opt_ignorebadroots, + Opt_ignoredatacsums, + Opt_rescue_all, /* Deprecated options */ Opt_recovery, + Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ Opt_check_integrity, @@ -455,9 +463,25 @@ static const match_table_t tokens = { static const match_table_t rescue_tokens = { {Opt_usebackuproot, "usebackuproot"}, {Opt_nologreplay, "nologreplay"}, + {Opt_ignorebadroots, "ignorebadroots"}, + {Opt_ignorebadroots, "ibadroots"}, + {Opt_ignoredatacsums, "ignoredatacsums"}, + {Opt_ignoredatacsums, "idatacsums"}, + {Opt_rescue_all, "all"}, {Opt_err, NULL}, }; +static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, + const char *opt_name) +{ + if (fs_info->mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -487,6 +511,23 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; + case Opt_ignorebadroots: + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + break; + case Opt_ignoredatacsums: + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + break; + case Opt_rescue_all: + btrfs_info(info, "enabling all of the rescue options"); + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; @@ -511,7 +552,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, { substring_t args[MAX_OPT_ARGS]; char *p, *num; - u64 cache_gen; int intarg; int ret = 0; char *compress_type; @@ -521,11 +561,17 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, bool saved_compress_force; int no_compress = 0; - cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (cache_gen) - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + else if (btrfs_free_space_cache_v1_active(info)) { + if (btrfs_is_zoned(info)) { + btrfs_info(info, + "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(info->super_copy, 0); + } else { + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + } + } /* * Even the options are empty, we still need to do extra 
check @@ -832,14 +878,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_inode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and will have no effect from 5.11"); - btrfs_set_pending_and_info(info, INODE_MAP_CACHE, - "enabling inode map caching"); - break; case Opt_noinode_cache: - btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, - "disabling inode map caching"); + btrfs_warn(info, + "the 'inode_cache' option is deprecated and has no effect since 5.11"); break; case Opt_clear_cache: btrfs_set_and_info(info, CLEAR_CACHE, @@ -968,14 +1009,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } } check: - /* - * Extra check for current option against current flag - */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { - btrfs_err(info, - "nologreplay must be used with ro mount option"); + /* We're read-only, don't have to check. */ + if (new_flags & SB_RDONLY) + goto out; + + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) ret = -EINVAL; - } out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, FREE_SPACE_TREE) && @@ -984,6 +1025,8 @@ out: ret = -EINVAL; } + if (!ret) + ret = btrfs_check_mountopts_zoned(info); if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) @@ -1127,7 +1170,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto err; } - path->leave_spinning = 1; name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { @@ -1256,7 +1298,6 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->leave_spinning = 1; /* * Find the "default" dir item which points to the root item that we @@ -1383,11 +1424,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return btrfs_commit_transaction(trans); } +static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) +{ + seq_printf(seq, "%s%s", (*printed) ? 
":" : ",rescue=", s); + *printed = true; +} + static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); const char *compress_type; const char *subvol_name; + bool printed = false; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1420,7 +1468,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) - seq_puts(seq, ",rescue=nologreplay"); + print_rescue_option(seq, "nologreplay", &printed); + if (btrfs_test_opt(info, USEBACKUPROOT)) + print_rescue_option(seq, "usebackuproot", &printed); + if (btrfs_test_opt(info, IGNOREBADROOTS)) + print_rescue_option(seq, "ignorebadroots", &printed); + if (btrfs_test_opt(info, IGNOREDATACSUMS)) + print_rescue_option(seq, "ignoredatacsums", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) @@ -1429,9 +1483,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(info, SPACE_CACHE)) + if (btrfs_free_space_cache_v1_active(info)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(info, FREE_SPACE_TREE)) + else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); @@ -1445,8 +1499,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",enospc_debug"); if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(info, INODE_MAP_CACHE)) - seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -1810,6 +1862,8 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { + const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); + /* * We need to cleanup all defragable inodes if the autodefragment is * close or the filesystem is read only. 
@@ -1826,12 +1880,15 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_cleanup(fs_info); + + /* If we toggled space cache */ + if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; unsigned old_flags = sb->s_flags; unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; @@ -1862,6 +1919,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from ro to rw"); + /* Make sure free space cache options match the state on disk */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } + } + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; @@ -1924,39 +1997,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto restore; - - /* recover relocation */ - mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(root); - mutex_unlock(&fs_info->cleaner_mutex); - if (ret) - goto restore; - - ret = btrfs_resume_balance_async(fs_info); + /* + * NOTE: when remounting with a change that does writes, don't + * put it anywhere above this point, as we are not sure to be + * safe to write until we pass the above checks. 
+ */ + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) goto restore; - ret = btrfs_resume_dev_replace_async(fs_info); - if (ret) { - btrfs_warn(fs_info, "failed to resume dev_replace"); - goto restore; - } - - btrfs_qgroup_rescan_resume(fs_info); - - if (!fs_info->uuid_root) { - btrfs_info(fs_info, "creating UUID tree"); - ret = btrfs_create_uuid_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to create the UUID tree %d", - ret); - goto restore; - } - } sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); @@ -1970,6 +2019,7 @@ out: wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); + btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; @@ -2156,7 +2206,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_used = 0; u64 total_free_data = 0; u64 total_free_meta = 0; - int bits = dentry->d_sb->s_blocksize_bits; + u32 bits = fs_info->sectorsize_bits; __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; @@ -2463,6 +2513,11 @@ static void __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif +#ifdef CONFIG_BLK_DEV_ZONED + ", zoned=yes" +#else + ", zoned=no" +#endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); } @@ -2523,8 +2578,6 @@ static int __init init_btrfs_fs(void) if (err) goto free_end_io_wq; - btrfs_init_lockdep(); - btrfs_print_mod_info(); err = btrfs_run_sanity_tests(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 279d9262b676..19b9fffa2c9c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -263,6 +263,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BTRFS_DEBUG +BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -278,6 +282,9 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_FEAT_ATTR_PTR(zoned), +#endif NULL }; @@ -329,10 +336,35 @@ static ssize_t send_stream_version_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); +static const char *rescue_opts[] = { + "usebackuproot", + "nologreplay", + "ignorebadroots", + "ignoredatacsums", + "all", +}; + +static ssize_t supported_rescue_options_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (i ? 
" " : ""), rescue_opts[i]); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_rescue_options, + supported_rescue_options_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), + BTRFS_ATTR_PTR(static_feature, supported_rescue_options), NULL }; @@ -433,7 +465,8 @@ static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, return -EINVAL; WRITE_ONCE(discard_ctl->iops_limit, iops_limit); - + btrfs_discard_calc_delay(discard_ctl); + btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, @@ -463,7 +496,7 @@ static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, return -EINVAL; WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); - + btrfs_discard_schedule_work(discard_ctl, true); return len; } BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, @@ -854,6 +887,82 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); +static ssize_t btrfs_generation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation); +} +BTRFS_ATTR(, generation, btrfs_generation_show); + +/* + * Look for an exact string @string in @buffer with possible leading or + * trailing whitespace + */ +static bool strmatch(const char *buffer, const char *string) +{ + const size_t len = strlen(string); + + /* Skip leading whitespace */ + buffer = skip_spaces(buffer); + + /* Match entire string, check if the rest is whitespace or empty */ + if (strncmp(string, buffer, len) == 0 && + strlen(skip_spaces(buffer + len)) == 0) + return true; + + return false; +} + +static const char * const btrfs_read_policy_name[] = { "pid" }; + +static ssize_t btrfs_read_policy_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + ssize_t ret = 0; + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (fs_devices->read_policy == i) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + (ret == 0 ? "" : " "), + btrfs_read_policy_name[i]); + else + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (ret == 0 ? 
"" : " "), + btrfs_read_policy_name[i]); + } + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + + return ret; +} + +static ssize_t btrfs_read_policy_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); + int i; + + for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { + if (strmatch(buf, btrfs_read_policy_name[i])) { + if (i != fs_devices->read_policy) { + fs_devices->read_policy = i; + btrfs_info(fs_devices->fs_info, + "read policy set to '%s'", + btrfs_read_policy_name[i]); + } + return len; + } + } + + return -EINVAL; +} +BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -863,6 +972,8 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), + BTRFS_ATTR_PTR(, generation), + BTRFS_ATTR_PTR(, read_policy), NULL, }; @@ -1207,7 +1318,7 @@ static const char *alloc_name(u64 flags) default: WARN_ON(1); return "invalid-combination"; - }; + } } /* @@ -1232,8 +1343,6 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, void btrfs_sysfs_remove_device(struct btrfs_device *device) { - struct hd_struct *disk; - struct kobject *disk_kobj; struct kobject *devices_kobj; /* @@ -1243,11 +1352,8 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device) devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); - if (device->bdev) { - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(devices_kobj, disk_kobj->name); - } + if (device->bdev) + sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); @@ -1353,11 +1459,7 @@ int btrfs_sysfs_add_device(struct btrfs_device *device) nofs_flag = memalloc_nofs_save(); if (device->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; - - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 999c14e5d0bd..8ca334d554af 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -134,6 +134,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); test_mnt->mnt_sb->s_fs_info = fs_info; @@ -224,7 +225,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); - btrfs_init_free_space_ctl(cache); + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); mutex_init(&cache->free_space_lock); return cache; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index df7ce874a74b..73e96d505f4f 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -379,54 +379,50 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - unsigned long len; unsigned long *bitmap = NULL; struct 
extent_buffer *eb = NULL; int ret; test_msg("running extent buffer bitmap tests"); - /* - * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than - * BTRFS_MAX_METADATA_BLOCKSIZE. - */ - len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) - ? sectorsize * 4 : sectorsize; - - fs_info = btrfs_alloc_dummy_fs_info(len, len); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); return -ENOMEM; } - bitmap = kmalloc(len, GFP_KERNEL); + bitmap = kmalloc(nodesize, GFP_KERNEL); if (!bitmap) { test_err("couldn't allocate test bitmap"); ret = -ENOMEM; goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, len); + eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } - ret = __test_eb_bitmaps(bitmap, eb, len); + ret = __test_eb_bitmaps(bitmap, eb, nodesize); if (ret) goto out; - /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(fs_info, nodesize / 2, len); + + /* + * Test again for case where the tree block is sectorsize aligned but + * not nodesize aligned. + */ + eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; goto out; } - ret = __test_eb_bitmaps(bitmap, eb, len); + ret = __test_eb_bitmaps(bitmap, eb, nodesize); out: free_extent_buffer(eb); kfree(bitmap); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index aebdf23f0cdd..8f05c1eb833f 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -399,7 +399,6 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache, u64 offset; u64 max_extent_size; const struct btrfs_free_space_op test_free_space_ops = { - .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, .use_bitmap = test_use_bitmap, }; const struct btrfs_free_space_op *orig_free_space_ops; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ce1ca8e73c2d..f3137285a9e2 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -36,7 +36,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { test_err("couldn't insert ref %d", ret); @@ -86,7 +85,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); @@ -135,7 +133,6 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, test_std_err(TEST_ALLOC_ROOT); return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { @@ -170,7 +167,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return -ENOMEM; } - path->leave_spinning = 1; ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 52ada47aff50..8e0f7a1029c6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -16,7 +16,6 @@ #include "transaction.h" #include "locking.h" #include "tree-log.h" -#include "inode-map.h" #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" @@ -155,6 +154,7 @@ static noinline void 
switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; + struct btrfs_caching_control *caching_ctl, *next; down_write(&fs_info->commit_root_sem); list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, @@ -162,8 +162,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - if (is_fstree(root->root_key.objectid)) - btrfs_unpin_free_ino(root); extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -180,6 +178,47 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) spin_lock(&cur_trans->dropped_roots_lock); } spin_unlock(&cur_trans->dropped_roots_lock); + + /* + * We have to update the last_byte_to_unpin under the commit_root_sem, + * at the same time we swap out the commit roots. + * + * This is because we must have a real view of the last spot the caching + * kthreads were while caching. Consider the following views of the + * extent tree for a block group + * + * commit root + * +----+----+----+----+----+----+----+ + * |\\\\| |\\\\|\\\\| |\\\\|\\\\| + * +----+----+----+----+----+----+----+ + * 0 1 2 3 4 5 6 7 + * + * new commit root + * +----+----+----+----+----+----+----+ + * | | | |\\\\| | |\\\\| + * +----+----+----+----+----+----+----+ + * 0 1 2 3 4 5 6 7 + * + * If the cache_ctl->progress was at 3, then we are only allowed to + * unpin [0,1) and [2,3], because the caching thread has already + * processed those extents. We are not allowed to unpin [5,6), because + * the caching thread will re-start its search from 3, and thus find + * the hole from [4,6) to add to the free space cache. 
+ */ + spin_lock(&fs_info->block_group_cache_lock); + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + struct btrfs_block_group *cache = caching_ctl->block_group; + + if (btrfs_block_group_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + btrfs_put_caching_control(caching_ctl); + } else { + cache->last_byte_to_unpin = caching_ctl->progress; + } + } + spin_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -856,24 +895,24 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) wait_current_trans(fs_info); } -static int should_end_transaction(struct btrfs_trans_handle *trans) +static bool should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; if (btrfs_check_space_for_delayed_refs(fs_info)) - return 1; + return true; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); } -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || cur_trans->delayed_refs.flushing) - return 1; + return true; return should_end_transaction(trans); } @@ -1300,8 +1339,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - btrfs_save_ino_cache(root, trans); - /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_mb__after_atomic(); @@ -1598,8 +1635,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - btrfs_set_lock_blocking_write(old); - ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ btrfs_tree_unlock(old); @@ -1681,7 +1716,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = current_time(parent_inode); - ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); + ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1761,6 +1796,8 @@ static void update_super_roots(struct btrfs_fs_info *fs_info) super->root_level = root_item->level; if (btrfs_test_opt(fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; + else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) + super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; } @@ -1956,10 +1993,8 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) } } -static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) +static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - /* * We use writeback_inodes_sb here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. @@ -1969,50 +2004,15 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_trans_handle *trans) * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. 
*/ - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); - } else { - struct btrfs_pending_snapshot *pending; - struct list_head *head = &trans->transaction->pending_snapshots; - - /* - * Flush dellaloc for any root that is going to be snapshotted. - * This is done to avoid a corrupted version of files, in the - * snapshots, that had both buffered and direct IO writes (even - * if they were done sequentially) due to an unordered update of - * the inode's size on disk. - */ - list_for_each_entry(pending, head, list) { - int ret; - - ret = btrfs_start_delalloc_snapshot(pending->root); - if (ret) - return ret; - } - } return 0; } -static inline void btrfs_wait_delalloc_flush(struct btrfs_trans_handle *trans) +static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) { + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - } else { - struct btrfs_pending_snapshot *pending; - struct list_head *head = &trans->transaction->pending_snapshots; - - /* - * Wait for any dellaloc that we started previously for the roots - * that are going to be snapshotted. This is to avoid a corrupted - * version of files in the snapshots that had both buffered and - * direct IO writes (even if they were done sequentially). - */ - list_for_each_entry(pending, head, list) - btrfs_wait_ordered_extents(pending->root, - U64_MAX, 0, U64_MAX); - } } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) @@ -2150,7 +2150,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_start_delalloc_flush(trans); + ret = btrfs_start_delalloc_flush(fs_info); if (ret) goto cleanup_transaction; @@ -2166,7 +2166,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - btrfs_wait_delalloc_flush(trans); + btrfs_wait_delalloc_flush(fs_info); /* * Wait for all ordered extents started by a fast fsync that joined this @@ -2293,8 +2293,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto unlock_tree_log; } - btrfs_prepare_extent_commit(fs_info); - cur_trans = fs_info->running_transaction; btrfs_set_root_node(&fs_info->tree_root->root_item, @@ -2435,10 +2433,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); - if (root->ino_cache_inode) { - iput(root->ino_cache_inode); - root->ino_cache_inode = NULL; - } if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) @@ -2459,16 +2453,6 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) if (!prev) return; - bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; - if (prev & bit) - btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - - bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; - if (prev & bit) - btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); - prev &= ~bit; - bit = 1 << BTRFS_PENDING_COMMIT; if (prev & bit) btrfs_debug(fs_info, "pending commit done"); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 858d9153a1cd..31ca81bad822 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -112,7 +112,6 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define 
BTRFS_SEND_TRANS_STUB ((void *)1) -#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; @@ -219,7 +218,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); +bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ea2bb4cb5890..028e733e42f3 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -100,7 +100,8 @@ static void file_extent_err(const struct extent_buffer *eb, int slot, */ #define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment) \ ({ \ - if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)))) \ file_extent_err((leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ @@ -203,7 +204,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, u32 item_size = btrfs_item_size_nr(leaf, slot); u64 extent_end; - if (!IS_ALIGNED(key->offset, sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { file_extent_err(leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); @@ -216,7 +217,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, * But if objectids mismatch, it means we have a missing * INODE_ITEM. */ - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); @@ -225,14 +226,15 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Make sure the item contains at least inline header, so the file * extent type is not some garbage. */ - if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { + if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) { file_extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; } - if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) { + if (unlikely(btrfs_file_extent_type(leaf, fi) >= + BTRFS_NR_FILE_EXTENT_TYPES)) { file_extent_err(leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), @@ -244,14 +246,15 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). 
*/ - if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { + if (unlikely(btrfs_file_extent_compression(leaf, fi) >= + BTRFS_NR_COMPRESS_TYPES)) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } - if (btrfs_file_extent_encryption(leaf, fi)) { + if (unlikely(btrfs_file_extent_encryption(leaf, fi))) { file_extent_err(leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); @@ -259,7 +262,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ - if (key->offset) { + if (unlikely(key->offset)) { file_extent_err(leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); @@ -272,8 +275,8 @@ static int check_extent_data_item(struct extent_buffer *leaf, return 0; /* Uncompressed inline extent size must match item size */ - if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + - btrfs_file_extent_ram_bytes(leaf, fi)) { + if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi))) { file_extent_err(leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + @@ -284,22 +287,22 @@ static int check_extent_data_item(struct extent_buffer *leaf, } /* Regular or preallocated extent has fixed item size */ - if (item_size != sizeof(*fi)) { + if (unlikely(item_size != sizeof(*fi))) { file_extent_err(leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) + if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))) return -EUCLEAN; /* Catch extent end overflow */ - if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), - key->offset, &extent_end)) { + if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end))) { file_extent_err(leaf, slot, "extent end overflow, have file offset %llu extent num bytes %llu", key->offset, @@ -320,7 +323,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, prev_fi = btrfs_item_ptr(leaf, slot - 1, struct btrfs_file_extent_item); prev_end = file_extent_end(leaf, prev_key, prev_fi); - if (prev_end > key->offset) { + if (unlikely(prev_end > key->offset)) { file_extent_err(leaf, slot - 1, "file extent end range (%llu) goes beyond start offset (%llu) of the next file extent", prev_end, key->offset); @@ -336,21 +339,21 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; - u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); + const u32 csumsize = fs_info->csum_size; - if (key->objectid != 
BTRFS_EXTENT_CSUM_OBJECTID) { + if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) { generic_err(leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } - if (!IS_ALIGNED(key->offset, sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { generic_err(leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } - if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + if (unlikely(!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize))) { generic_err(leaf, slot, "unaligned item size for csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); @@ -363,7 +366,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, prev_item_size = btrfs_item_size_nr(leaf, slot - 1); prev_csum_end = (prev_item_size / csumsize) * sectorsize; prev_csum_end += prev_key->offset; - if (prev_csum_end > key->offset) { + if (unlikely(prev_csum_end > key->offset)) { generic_err(leaf, slot - 1, "csum end range (%llu) goes beyond the start range (%llu) of the next csum item", prev_csum_end, key->offset); @@ -388,15 +391,16 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, /* For XATTR_ITEM, location key should be all 0 */ if (item_key.type == BTRFS_XATTR_ITEM_KEY) { - if (key->type != 0 || key->objectid != 0 || key->offset != 0) + if (unlikely(key->objectid != 0 || key->type != 0 || + key->offset != 0)) return -EUCLEAN; return 0; } - if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || - key->objectid > BTRFS_LAST_FREE_OBJECTID) && - key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && - key->objectid != BTRFS_FREE_INO_OBJECTID) { + if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID)) { if (is_inode_item) { generic_err(leaf, slot, "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", @@ -414,7 +418,7 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, } return -EUCLEAN; } - if (key->offset != 0) { + if (unlikely(key->offset != 0)) { if (is_inode_item) inode_item_err(leaf, slot, "invalid key offset: has %llu expect 0", @@ -438,7 +442,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); /* No such tree id */ - if (key->objectid == 0) { + if (unlikely(key->objectid == 0)) { if (is_root_item) generic_err(leaf, slot, "invalid root id 0"); else @@ -448,7 +452,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, } /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ - if (!is_fstree(key->objectid) && !is_root_item) { + if (unlikely(!is_fstree(key->objectid) && !is_root_item)) { dir_item_err(leaf, slot, "invalid location key objectid, have %llu expect [%llu, %llu]", key->objectid, BTRFS_FIRST_FREE_OBJECTID, @@ -464,7 +468,8 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, * So here we only check offset for reloc tree whose key->offset must * be a valid tree. 
*/ - if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID && + key->offset == 0)) { generic_err(leaf, slot, "invalid root id 0 for reloc tree"); return -EUCLEAN; } @@ -480,8 +485,9 @@ static int check_dir_item(struct extent_buffer *leaf, u32 item_size = btrfs_item_size_nr(leaf, slot); u32 cur = 0; - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { struct btrfs_key location_key; @@ -494,7 +500,7 @@ static int check_dir_item(struct extent_buffer *leaf, int ret; /* header itself should not cross item boundary */ - if (cur + sizeof(*di) > item_size) { + if (unlikely(cur + sizeof(*di) > item_size)) { dir_item_err(leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); @@ -505,12 +511,12 @@ static int check_dir_item(struct extent_buffer *leaf, btrfs_dir_item_key_to_cpu(leaf, di, &location_key); if (location_key.type == BTRFS_ROOT_ITEM_KEY) { ret = check_root_key(leaf, &location_key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } else if (location_key.type == BTRFS_INODE_ITEM_KEY || location_key.type == 0) { ret = check_inode_key(leaf, &location_key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } else { dir_item_err(leaf, slot, @@ -522,22 +528,22 @@ static int check_dir_item(struct extent_buffer *leaf, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); - if (dir_type >= BTRFS_FT_MAX) { + if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; } - if (key->type == BTRFS_XATTR_ITEM_KEY && - dir_type != BTRFS_FT_XATTR) { + if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } - if (dir_type == BTRFS_FT_XATTR && - key->type != BTRFS_XATTR_ITEM_KEY) { + if (unlikely(dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY)) { dir_item_err(leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; @@ -550,13 +556,13 @@ static int check_dir_item(struct extent_buffer *leaf, /* Name/data length check */ name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); - if (name_len > max_name_len) { + if (unlikely(name_len > max_name_len)) { dir_item_err(leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } - if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { + if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) { dir_item_err(leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, @@ -564,7 +570,7 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (data_len && dir_type != BTRFS_FT_XATTR) { + if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) { dir_item_err(leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); @@ -574,7 +580,7 @@ static int check_dir_item(struct extent_buffer *leaf, total_size = sizeof(*di) + name_len + data_len; /* header and name/data should not cross item boundary */ - if (cur + total_size > item_size) { + if (unlikely(cur + total_size > item_size)) { dir_item_err(leaf, slot, "dir item data crosses item boundary, have %u 
boundary %u", cur + total_size, item_size); @@ -592,7 +598,7 @@ static int check_dir_item(struct extent_buffer *leaf, read_extent_buffer(leaf, namebuf, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); - if (key->offset != name_hash) { + if (unlikely(key->offset != name_hash)) { dir_item_err(leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); @@ -641,13 +647,13 @@ static int check_block_group_item(struct extent_buffer *leaf, * Here we don't really care about alignment since extent allocator can * handle it. We care more about the size. */ - if (key->offset == 0) { + if (unlikely(key->offset == 0)) { block_group_err(leaf, slot, "invalid block group size 0"); return -EUCLEAN; } - if (item_size != sizeof(bgi)) { + if (unlikely(item_size != sizeof(bgi))) { block_group_err(leaf, slot, "invalid item size, have %u expect %zu", item_size, sizeof(bgi)); @@ -656,8 +662,8 @@ static int check_block_group_item(struct extent_buffer *leaf, read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), sizeof(bgi)); - if (btrfs_stack_block_group_chunk_objectid(&bgi) != - BTRFS_FIRST_CHUNK_TREE_OBJECTID) { + if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { block_group_err(leaf, slot, "invalid block group chunk objectid, have %llu expect %llu", btrfs_stack_block_group_chunk_objectid(&bgi), @@ -665,7 +671,7 @@ static int check_block_group_item(struct extent_buffer *leaf, return -EUCLEAN; } - if (btrfs_stack_block_group_used(&bgi) > key->offset) { + if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) { block_group_err(leaf, slot, "invalid block group used, have %llu expect [0, %llu)", btrfs_stack_block_group_used(&bgi), key->offset); @@ -673,7 +679,7 @@ static int check_block_group_item(struct extent_buffer *leaf, } flags = btrfs_stack_block_group_flags(&bgi); - if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) { + if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) { block_group_err(leaf, slot, "invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set", flags & BTRFS_BLOCK_GROUP_PROFILE_MASK, @@ -682,11 +688,11 @@ static int check_block_group_item(struct extent_buffer *leaf, } type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - if (type != BTRFS_BLOCK_GROUP_DATA && - type != BTRFS_BLOCK_GROUP_METADATA && - type != BTRFS_BLOCK_GROUP_SYSTEM && - type != (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_DATA)) { + if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && + type != BTRFS_BLOCK_GROUP_METADATA && + type != BTRFS_BLOCK_GROUP_SYSTEM && + type != (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA))) { block_group_err(leaf, slot, "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", type, hweight64(type), @@ -773,49 +779,49 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, ncopies = btrfs_raid_array[raid_index].ncopies; nparity = btrfs_raid_array[raid_index].nparity; - if (!num_stripes) { + if (unlikely(!num_stripes)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } - if (num_stripes < ncopies) { + if (unlikely(num_stripes < ncopies)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes < ncopies, have %u < %d", num_stripes, ncopies); return -EUCLEAN; } - if (nparity && num_stripes == nparity) { + if (unlikely(nparity && num_stripes == nparity)) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes == 
nparity, have %u == %d", num_stripes, nparity); return -EUCLEAN; } - if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(logical, fs_info->sectorsize))) { chunk_err(leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", logical, fs_info->sectorsize); return -EUCLEAN; } - if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize)) { chunk_err(leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", btrfs_chunk_sector_size(leaf, chunk), fs_info->sectorsize); return -EUCLEAN; } - if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + if (unlikely(!length || !IS_ALIGNED(length, fs_info->sectorsize))) { chunk_err(leaf, chunk, logical, "invalid chunk length, have %llu", length); return -EUCLEAN; } - if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) { chunk_err(leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); return -EUCLEAN; } - if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - type) { + if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK))) { chunk_err(leaf, chunk, logical, "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | @@ -824,22 +830,23 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return -EUCLEAN; } - if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && - (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + if (unlikely(!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)) { chunk_err(leaf, chunk, logical, "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", type & BTRFS_BLOCK_GROUP_PROFILE_MASK); return -EUCLEAN; } - if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) { chunk_err(leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EUCLEAN; } - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && - (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_DATA)))) { chunk_err(leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); @@ -851,20 +858,21 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, mixed = true; if (!mixed) { - if ((type & BTRFS_BLOCK_GROUP_METADATA) && - (type & BTRFS_BLOCK_GROUP_DATA)) { + if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA))) { chunk_err(leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EUCLEAN; } } - if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { + if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) 
== 0 && + num_stripes != 1))) { chunk_err(leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -887,7 +895,7 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, { int num_stripes; - if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) { + if (unlikely(btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect [%zu, %u)", btrfs_item_size_nr(leaf, slot), @@ -901,8 +909,8 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf, if (num_stripes == 0) goto out; - if (btrfs_chunk_item_size(num_stripes) != - btrfs_item_size_nr(leaf, slot)) { + if (unlikely(btrfs_chunk_item_size(num_stripes) != + btrfs_item_size_nr(leaf, slot))) { chunk_err(leaf, chunk, key->offset, "invalid chunk item size: have %u expect %lu", btrfs_item_size_nr(leaf, slot), @@ -941,14 +949,14 @@ static int check_dev_item(struct extent_buffer *leaf, { struct btrfs_dev_item *ditem; - if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) { dev_item_err(leaf, slot, "invalid objectid: has=%llu expect=%llu", key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - if (btrfs_device_id(leaf, ditem) != key->offset) { + if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) { dev_item_err(leaf, slot, "devid mismatch: key has=%llu item has=%llu", key->offset, btrfs_device_id(leaf, ditem)); @@ -960,8 +968,8 @@ static int check_dev_item(struct extent_buffer *leaf, * it can be 0 for device removal. Device size check can only be done * by dev extents check. */ - if (btrfs_device_bytes_used(leaf, ditem) > - btrfs_device_total_bytes(leaf, ditem)) { + if (unlikely(btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem))) { dev_item_err(leaf, slot, "invalid bytes used: have %llu expect [0, %llu]", btrfs_device_bytes_used(leaf, ditem), @@ -986,13 +994,13 @@ static int check_inode_item(struct extent_buffer *leaf, int ret; ret = check_inode_key(leaf, key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ - if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { + if (unlikely(btrfs_inode_generation(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), @@ -1000,7 +1008,7 @@ static int check_inode_item(struct extent_buffer *leaf, return -EUCLEAN; } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ - if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + if (unlikely(btrfs_inode_transid(leaf, iitem) > super_gen + 1)) { inode_item_err(leaf, slot, "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); @@ -1013,7 +1021,7 @@ static int check_inode_item(struct extent_buffer *leaf, * anything in the fs. So here we skip the check. */ mode = btrfs_inode_mode(leaf, iitem); - if (mode & ~valid_mask) { + if (unlikely(mode & ~valid_mask)) { inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); @@ -1026,20 +1034,20 @@ static int check_inode_item(struct extent_buffer *leaf, * FIFO/CHR/DIR/REG. 
Only needs to check BLK, LNK and SOCKS */ if (!has_single_bit_set(mode & S_IFMT)) { - if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { + if (unlikely(!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode))) { inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } - if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { + if (unlikely(S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1)) { inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } - if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { + if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & @@ -1059,11 +1067,12 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int ret; ret = check_root_key(leaf, key, slot); - if (ret < 0) + if (unlikely(ret < 0)) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && - btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { + if (unlikely(btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != + btrfs_legacy_root_item_size())) { generic_err(leaf, slot, "invalid root item size, have %u expect %zu or %u", btrfs_item_size_nr(leaf, slot), sizeof(ri), @@ -1080,24 +1089,24 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot)); /* Generation related */ - if (btrfs_root_generation(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_generation(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root generation, have %llu expect (0, %llu]", btrfs_root_generation(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (btrfs_root_generation_v2(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_generation_v2(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root v2 generation, have %llu expect (0, %llu]", btrfs_root_generation_v2(&ri), btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (btrfs_root_last_snapshot(&ri) > - btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(btrfs_root_last_snapshot(&ri) > + btrfs_super_generation(fs_info->super_copy) + 1)) { generic_err(leaf, slot, "invalid root last_snapshot, have %llu expect (0, %llu]", btrfs_root_last_snapshot(&ri), @@ -1106,27 +1115,27 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, } /* Alignment and level check */ - if (!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) { generic_err(leaf, slot, "invalid root bytenr, have %llu expect to be aligned to %u", btrfs_root_bytenr(&ri), fs_info->sectorsize); return -EUCLEAN; } - if (btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } - if (ri.drop_level >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) { generic_err(leaf, slot, "invalid root level, have %u expect [0, %u]", - ri.drop_level, BTRFS_MAX_LEVEL - 1); + btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1); 
return -EUCLEAN; } /* Flags check */ - if (btrfs_root_flags(&ri) & ~valid_root_flags) { + if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) { generic_err(leaf, slot, "invalid root flags, have 0x%llx expect mask 0x%llx", btrfs_root_flags(&ri), valid_root_flags); @@ -1180,14 +1189,14 @@ static int check_extent_item(struct extent_buffer *leaf, u64 total_refs; /* Total refs in btrfs_extent_item */ u64 inline_refs = 0; /* found total inline refs */ - if (key->type == BTRFS_METADATA_ITEM_KEY && - !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) { generic_err(leaf, slot, "invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled"); return -EUCLEAN; } /* key->objectid is the bytenr for both key types */ - if (!IS_ALIGNED(key->objectid, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid, have %llu expect to be aligned to %u", key->objectid, fs_info->sectorsize); @@ -1195,8 +1204,8 @@ static int check_extent_item(struct extent_buffer *leaf, } /* key->offset is tree level for METADATA_ITEM_KEY */ - if (key->type == BTRFS_METADATA_ITEM_KEY && - key->offset >= BTRFS_MAX_LEVEL) { + if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY && + key->offset >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree level, have %llu expect [0, %u]", key->offset, BTRFS_MAX_LEVEL - 1); @@ -1222,7 +1231,7 @@ static int check_extent_item(struct extent_buffer *leaf, * Either using btrfs_extent_inline_ref::offset, or specific * data structure. */ - if (item_size < sizeof(*ei)) { + if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, "invalid item size, have %u expect [%zu, %u)", item_size, sizeof(*ei), @@ -1236,15 +1245,16 @@ static int check_extent_item(struct extent_buffer *leaf, flags = btrfs_extent_flags(leaf, ei); total_refs = btrfs_extent_refs(leaf, ei); generation = btrfs_extent_generation(leaf, ei); - if (generation > btrfs_super_generation(fs_info->super_copy) + 1) { + if (unlikely(generation > + btrfs_super_generation(fs_info->super_copy) + 1)) { extent_err(leaf, slot, "invalid generation, have %llu expect (0, %llu]", generation, btrfs_super_generation(fs_info->super_copy) + 1); return -EUCLEAN; } - if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | - BTRFS_EXTENT_FLAG_TREE_BLOCK))) { + if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA | + BTRFS_EXTENT_FLAG_TREE_BLOCK)))) { extent_err(leaf, slot, "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx", flags, BTRFS_EXTENT_FLAG_DATA | @@ -1253,21 +1263,21 @@ static int check_extent_item(struct extent_buffer *leaf, } is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK); if (is_tree_block) { - if (key->type == BTRFS_EXTENT_ITEM_KEY && - key->offset != fs_info->nodesize) { + if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY && + key->offset != fs_info->nodesize)) { extent_err(leaf, slot, "invalid extent length, have %llu expect %u", key->offset, fs_info->nodesize); return -EUCLEAN; } } else { - if (key->type != BTRFS_EXTENT_ITEM_KEY) { + if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) { extent_err(leaf, slot, "invalid key type, have %u expect %u for data backref", key->type, BTRFS_EXTENT_ITEM_KEY); return -EUCLEAN; } - if (!IS_ALIGNED(key->offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent length, have %llu expect 
aligned to %u", key->offset, fs_info->sectorsize); @@ -1281,7 +1291,7 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)ptr; - if (btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) { extent_err(leaf, slot, "invalid tree block info level, have %u expect [0, %u]", btrfs_tree_block_level(leaf, info), @@ -1300,7 +1310,7 @@ static int check_extent_item(struct extent_buffer *leaf, u64 inline_offset; u8 inline_type; - if (ptr + sizeof(*iref) > end) { + if (unlikely(ptr + sizeof(*iref) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %zu end %lu", ptr, sizeof(*iref), end); @@ -1309,7 +1319,7 @@ static int check_extent_item(struct extent_buffer *leaf, iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) { + if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", ptr, inline_type, end); @@ -1323,7 +1333,8 @@ static int check_extent_item(struct extent_buffer *leaf, break; /* Contains parent bytenr */ case BTRFS_SHARED_BLOCK_REF_KEY: - if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); @@ -1338,7 +1349,8 @@ static int check_extent_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); - if (!IS_ALIGNED(dref_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(dref_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data ref offset, have %llu expect aligned to %u", dref_offset, fs_info->sectorsize); @@ -1349,7 +1361,8 @@ static int check_extent_item(struct extent_buffer *leaf, /* Contains parent bytenr and ref count */ case BTRFS_SHARED_DATA_REF_KEY: sref = (struct btrfs_shared_data_ref *)(iref + 1); - if (!IS_ALIGNED(inline_offset, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(inline_offset, + fs_info->sectorsize))) { extent_err(leaf, slot, "invalid data parent bytenr, have %llu expect aligned to %u", inline_offset, fs_info->sectorsize); @@ -1365,14 +1378,14 @@ static int check_extent_item(struct extent_buffer *leaf, ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ - if (ptr != end) { + if (unlikely(ptr != end)) { extent_err(leaf, slot, "invalid extent item size, padding bytes found"); return -EUCLEAN; } /* Finally, check the inline refs against total refs */ - if (inline_refs > total_refs) { + if (unlikely(inline_refs > total_refs)) { extent_err(leaf, slot, "invalid extent refs, have %llu expect >= inline %llu", total_refs, inline_refs); @@ -1389,21 +1402,21 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf, if (key->type == BTRFS_SHARED_DATA_REF_KEY) expect_item_size = sizeof(struct btrfs_shared_data_ref); - if (btrfs_item_size_nr(leaf, slot) != expect_item_size) { + if (unlikely(btrfs_item_size_nr(leaf, slot) != expect_item_size)) { generic_err(leaf, slot, "invalid item size, have %u expect %u for key type %u", btrfs_item_size_nr(leaf, slot), 
expect_item_size, key->type); return -EUCLEAN; } - if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); return -EUCLEAN; } - if (key->type != BTRFS_TREE_BLOCK_REF_KEY && - !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize)) { + if (unlikely(key->type != BTRFS_TREE_BLOCK_REF_KEY && + !IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid tree parent bytenr, have %llu expect aligned to %u", key->offset, leaf->fs_info->sectorsize); @@ -1419,14 +1432,14 @@ static int check_extent_data_ref(struct extent_buffer *leaf, unsigned long ptr = btrfs_item_ptr_offset(leaf, slot); const unsigned long end = ptr + btrfs_item_size_nr(leaf, slot); - if (btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0) { + if (unlikely(btrfs_item_size_nr(leaf, slot) % sizeof(*dref) != 0)) { generic_err(leaf, slot, "invalid item size, have %u expect aligned to %zu for key type %u", btrfs_item_size_nr(leaf, slot), sizeof(*dref), key->type); return -EUCLEAN; } - if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { generic_err(leaf, slot, "invalid key objectid for shared block ref, have %llu expect aligned to %u", key->objectid, leaf->fs_info->sectorsize); @@ -1443,13 +1456,13 @@ static int check_extent_data_ref(struct extent_buffer *leaf, owner = btrfs_extent_data_ref_objectid(leaf, dref); offset = btrfs_extent_data_ref_offset(leaf, dref); hash = hash_extent_data_ref(root_objectid, owner, offset); - if (hash != key->offset) { + if (unlikely(hash != key->offset)) { extent_err(leaf, slot, "invalid extent data ref hash, item has 0x%016llx key has 0x%016llx", hash, key->offset); return -EUCLEAN; } - if (!IS_ALIGNED(offset, leaf->fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) { extent_err(leaf, slot, "invalid extent data backref offset, have %llu expect aligned to %u", offset, leaf->fs_info->sectorsize); @@ -1469,10 +1482,10 @@ static int check_inode_ref(struct extent_buffer *leaf, unsigned long ptr; unsigned long end; - if (!check_prev_ino(leaf, key, slot, prev_key)) + if (unlikely(!check_prev_ino(leaf, key, slot, prev_key))) return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ - if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { + if (unlikely(btrfs_item_size_nr(leaf, slot) <= sizeof(*iref))) { inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size_nr(leaf, slot), @@ -1485,7 +1498,7 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (ptr + sizeof(iref) > end) { + if (unlikely(ptr + sizeof(iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); @@ -1494,7 +1507,7 @@ static int check_inode_ref(struct extent_buffer *leaf, iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); - if (ptr + sizeof(*iref) + namelen > end) { + if (unlikely(ptr + sizeof(*iref) + namelen > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); @@ -1577,7 +1590,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) u32 nritems = btrfs_header_nritems(leaf); int slot; - if (btrfs_header_level(leaf) != 0) 
{ + if (unlikely(btrfs_header_level(leaf) != 0)) { generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); @@ -1596,19 +1609,19 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) u64 owner = btrfs_header_owner(leaf); /* These trees must never be empty */ - if (owner == BTRFS_ROOT_TREE_OBJECTID || - owner == BTRFS_CHUNK_TREE_OBJECTID || - owner == BTRFS_EXTENT_TREE_OBJECTID || - owner == BTRFS_DEV_TREE_OBJECTID || - owner == BTRFS_FS_TREE_OBJECTID || - owner == BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || + owner == BTRFS_CHUNK_TREE_OBJECTID || + owner == BTRFS_EXTENT_TREE_OBJECTID || + owner == BTRFS_DEV_TREE_OBJECTID || + owner == BTRFS_FS_TREE_OBJECTID || + owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); return -EUCLEAN; } /* Unknown tree */ - if (owner == 0) { + if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); return -EUCLEAN; @@ -1616,7 +1629,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return 0; } - if (nritems == 0) + if (unlikely(nritems == 0)) return 0; /* @@ -1637,7 +1650,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) btrfs_item_key_to_cpu(leaf, &key, slot); /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, @@ -1656,7 +1669,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) else item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); - if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + if (unlikely(btrfs_item_end_nr(leaf, slot) != item_end_expected)) { generic_err(leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), @@ -1669,8 +1682,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * just in case all the items are consistent to each other, but * all point outside of the leaf. */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { + if (unlikely(btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info))) { generic_err(leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), @@ -1679,8 +1692,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) } /* Also check if the item pointer overlaps with btrfs item. 
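The tree-checker hunks above wrap every corruption test in unlikely(). As a minimal sketch of what that annotation buys (the macro re-definition and the check_level() helper below are hypothetical userspace stand-ins, not kernel code):

#include <errno.h>

/* In the kernel this comes from <linux/compiler.h>. */
#define unlikely(x) __builtin_expect(!!(x), 0)

#define MAX_LEVEL 8 /* stands in for BTRFS_MAX_LEVEL */

/* Corruption is the cold path, so hint the compiler accordingly. */
static int check_level(int level)
{
	if (unlikely(level < 0 || level >= MAX_LEVEL))
		return -EUCLEAN; /* "structure needs cleaning" */
	return 0;
}

The hint changes no behavior; it only asks the compiler to lay the error branch out away from the hot path, which matters for checks that run on every tree block read.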
*/ - if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > - btrfs_item_ptr_offset(leaf, slot)) { + if (unlikely(btrfs_item_ptr_offset(leaf, slot) < + btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + @@ -1695,7 +1708,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) * criteria */ ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (ret < 0) + if (unlikely(ret < 0)) return ret; } @@ -1728,13 +1741,13 @@ int btrfs_check_node(struct extent_buffer *node) u64 bytenr; int ret = 0; - if (level <= 0 || level >= BTRFS_MAX_LEVEL) { + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); return -EUCLEAN; } - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { + if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", btrfs_header_owner(node), node->start, @@ -1748,13 +1761,13 @@ int btrfs_check_node(struct extent_buffer *node) btrfs_node_key_to_cpu(node, &key, slot); btrfs_node_key_to_cpu(node, &next_key, slot + 1); - if (!bytenr) { + if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } - if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { + if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); @@ -1762,7 +1775,7 @@ int btrfs_check_node(struct extent_buffer *node) goto out; } - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index d3f28b8f4ff9..7c45d960b53c 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,7 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u32 nritems; root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking_write(root_node); nritems = btrfs_header_nritems(root_node); root->defrag_max.objectid = 0; /* from above we know this is not a leaf */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 56cbc1706b6f..254c2ee43aae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -17,7 +17,6 @@ #include "backref.h" #include "compression.h" #include "qgroup.h" -#include "inode-map.h" #include "block-group.h" #include "space-info.h" @@ -139,8 +138,25 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; int ret = 0; + /* + * First check if the log root tree was already created. If not, create + * it before locking the root's log_mutex, just to keep lockdep happy. 
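The start_log_trans() hunk continuing below creates the log root tree with a check/lock/re-check sequence: test the bit locklessly, then re-test the pointer under tree_root->log_mutex before initializing. A minimal userspace sketch of the same pattern, with hypothetical names standing in for the kernel structures:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool has_log_tree;  /* plays the role of BTRFS_ROOT_HAS_LOG_TREE */
static void *log_root_tree;       /* plays the role of fs_info->log_root_tree */

static int ensure_log_root_tree(void)
{
	int ret = 0;

	/* Cheap lockless fast path, like the test_bit() above. */
	if (!atomic_load(&has_log_tree)) {
		pthread_mutex_lock(&log_mutex);
		/* Re-check under the lock: another thread may have won. */
		if (!log_root_tree) {
			log_root_tree = malloc(1); /* i.e. btrfs_init_log_root_tree() */
			if (log_root_tree)
				atomic_store(&has_log_tree, true);
			else
				ret = -1;
		}
		pthread_mutex_unlock(&log_mutex);
	}
	return ret;
}

Doing this before taking root->log_mutex keeps two locks of the same lock class (the log_mutex of different roots) from nesting, which is the lockdep concern the comment mentions.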
+ */ + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { + mutex_lock(&tree_root->log_mutex); + if (!fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, fs_info); + if (!ret) + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + } + mutex_unlock(&tree_root->log_mutex); + if (ret) + return ret; + } + mutex_lock(&root->log_mutex); if (root->log_root) { @@ -156,13 +172,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { - mutex_lock(&fs_info->tree_log_mutex); - if (!fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, fs_info); - mutex_unlock(&fs_info->tree_log_mutex); - if (ret) - goto out; - ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -172,7 +181,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_start_pid = current->pid; } - atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; @@ -576,6 +584,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; @@ -653,7 +662,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); + drop_args.start = start; + drop_args.end = extent_end; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); if (ret) goto out; @@ -828,9 +840,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret) goto out; - inode_add_bytes(inode, nbytes); update_inode: - ret = btrfs_update_inode(trans, root, inode); + btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: if (inode) iput(inode); @@ -1529,7 +1541,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1564,18 +1576,6 @@ out: return ret; } -static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 ino) -{ - int ret; - - ret = btrfs_insert_orphan_item(trans, root, ino); - if (ret == -EEXIST) - ret = 0; - - return ret; -} - static int count_inode_extrefs(struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { @@ -1716,7 +1716,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, root, BTRFS_I(inode)); } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1727,7 +1727,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (ret) goto out; } - ret = insert_orphan_item(trans, root, ino); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; } out: @@ -1820,7 +1822,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; } else 
{ @@ -1973,7 +1975,7 @@ out: btrfs_release_path(path); if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); - ret = btrfs_update_inode(trans, root, dir); + ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } kfree(name); iput(dir); @@ -2586,6 +2588,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, * those prealloc extents just after replaying them. */ if (S_ISREG(mode)) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct inode *inode; u64 from; @@ -2596,12 +2599,18 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, } from = ALIGN(i_size_read(inode), root->fs_info->sectorsize); - ret = btrfs_drop_extents(wc->trans, root, inode, - from, (u64)-1, 1); + drop_args.start = from; + drop_args.end = (u64)-1; + drop_args.drop_cache = true; + ret = btrfs_drop_extents(wc->trans, root, + BTRFS_I(inode), + &drop_args); if (!ret) { + inode_sub_bytes(inode, + drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, inode); + root, BTRFS_I(inode)); } iput(inode); if (ret) @@ -2709,7 +2718,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; - next = btrfs_find_create_tree_block(fs_info, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), + *level - 1); if (IS_ERR(next)) return PTR_ERR(next); @@ -2732,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2801,7 +2811,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -2883,7 +2892,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); - btrfs_set_lock_blocking_write(next); btrfs_clean_tree_block(next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); @@ -3023,6 +3031,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; + u64 log_root_start; + u64 log_root_level; mutex_lock(&root->log_mutex); log_transid = ctx->log_transid; @@ -3200,22 +3210,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - btrfs_set_super_log_root(fs_info->super_for_commit, - log_root_tree->node->start); - btrfs_set_super_log_root_level(fs_info->super_for_commit, - btrfs_header_level(log_root_tree->node)); - + log_root_start = log_root_tree->node->start; + log_root_level = btrfs_header_level(log_root_tree->node); log_root_tree->log_transid++; mutex_unlock(&log_root_tree->log_mutex); /* - * Nobody else is going to jump in and write the ctree - * super here because the log_commit atomic below is protecting - * us. We must be called with a transaction handle pinning - * the running transaction open, so a full commit can't hop - * in and cause problems either. 
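Several hunks in this file replace the long positional parameter list of __btrfs_drop_extents() with a zero-initialized btrfs_drop_extents_args in which some fields are inputs and others (extent_inserted, bytes_found) are return values. A small sketch of that calling convention, using hypothetical types and names:

#include <stdbool.h>
#include <stdint.h>

struct drop_extents_args {
	/* Inputs. */
	uint64_t start;
	uint64_t end;
	bool drop_cache;
	/* Outputs, filled in by the callee. */
	bool extent_inserted;
	uint64_t bytes_found;
};

/* Hypothetical stand-in for btrfs_drop_extents(). */
static int drop_extents(struct drop_extents_args *args)
{
	args->bytes_found = args->end - args->start; /* pretend work */
	return 0;
}

static int replay_extent(uint64_t start, uint64_t extent_end)
{
	struct drop_extents_args drop_args = { 0 };
	int ret;

	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = drop_extents(&drop_args);
	/* On success, drop_args.bytes_found reports what was dropped. */
	return ret;
}

New parameters can now be added without touching every caller, and outputs no longer need extra pointer arguments.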
+ * Here we are guaranteed that nobody is going to write the superblock + * for the current transaction before us and that we do not write + * our superblock before the previous transaction finishes its commit + * and writes its superblock, because: + * + * 1) We are holding a handle on the current transaction, so nobody + * can commit it until we release the handle; + * + * 2) Before writing our superblock we acquire the tree_log_mutex, so + * if the previous transaction is still committing, and hasn't yet + * written its superblock, we wait for it to do it, because a + * transaction commit acquires the tree_log_mutex when the commit + * begins and releases it only after writing its superblock. */ + mutex_lock(&fs_info->tree_log_mutex); + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); + btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); + mutex_unlock(&fs_info->tree_log_mutex); if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3300,6 +3319,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree); fs_info->log_root_tree = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); } return 0; } @@ -4196,6 +4216,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { + struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -4204,19 +4225,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int extent_inserted = 0; ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); if (ret) return ret; - if (!extent_inserted) { + if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; @@ -4375,8 +4398,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, do { ret = btrfs_truncate_inode_items(trans, root->log_root, - &inode->vfs_inode, - truncate_offset, + inode, truncate_offset, BTRFS_EXTENT_DATA_KEY); } while (ret == -EAGAIN); if (ret) @@ -4415,14 +4437,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 test_gen; int ret = 0; int num = 0; INIT_LIST_HEAD(&extents); write_lock(&tree->lock); - test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); @@ -4438,7 +4458,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, goto process; } - if (em->generation <= test_gen) + if (em->generation < trans->transid) continue; /* We log prealloc extents beyond eof later.
*/ @@ -4571,6 +4591,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, const u64 ino = btrfs_ino(inode); int ins_nr = 0; int start_slot = 0; + bool found_xattrs = false; + + if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) + return 0; key.objectid = ino; key.type = BTRFS_XATTR_ITEM_KEY; @@ -4609,6 +4633,7 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, start_slot = slot; ins_nr++; path->slots[0]++; + found_xattrs = true; cond_resched(); } if (ins_nr > 0) { @@ -4618,6 +4643,9 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, return ret; } + if (!found_xattrs) + set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); + return 0; } @@ -5303,7 +5331,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, &inode->vfs_inode, 0, 0); + log, inode, 0, 0); if (ret != -EAGAIN) break; } @@ -5442,11 +5470,10 @@ out_unlock: static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans > fs_info->last_trans_committed) { + if (inode->last_unlink_trans >= trans->transid) { /* * Make sure any commits to the log are forced to be full * commits. @@ -5468,8 +5495,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - struct super_block *sb, - u64 last_committed) + struct super_block *sb) { int ret = 0; struct dentry *old_parent = NULL; @@ -5481,8 +5507,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, * and other fun in this file. 
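The comparisons rewritten in these hunks drop fs_info->last_trans_committed in favor of trans->transid: everything committed before the running transaction has a strictly smaller generation, so the test no longer depends on a counter that an in-flight commit may not have bumped yet. The idiom in one hypothetical helper:

#include <stdbool.h>
#include <stdint.h>

/* generation >= transid  <=>  modified in the currently running transaction */
static bool changed_in_current_trans(uint64_t generation, uint64_t transid)
{
	return generation >= transid;
}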
*/ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) goto out; if (!S_ISDIR(inode->vfs_inode.i_mode)) { @@ -5828,7 +5854,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - const u64 last_committed = fs_info->last_trans_committed; struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; struct btrfs_key search_key; @@ -5847,7 +5872,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation > last_committed) + if (BTRFS_I(inode)->generation >= trans->transid) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5888,7 +5913,6 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; struct dentry *old_parent = NULL; struct super_block *sb = inode->vfs_inode.i_sb; int ret = 0; @@ -5902,7 +5926,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation > fs_info->last_trans_committed) { + if (inode->generation >= trans->transid) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) @@ -6019,7 +6043,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; int ret = 0; - u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; sb = inode->vfs_inode.i_sb; @@ -6029,23 +6052,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - /* - * The prev transaction commit doesn't complete, we need do - * full commit by ourselves. - */ - if (fs_info->last_trans_log_full_commit > - fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb, - last_committed); + ret = check_parent_dirs_for_sync(trans, inode, parent, sb); if (ret) goto end_no_trans; @@ -6075,8 +6087,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * and other fun in this file. */ if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation <= last_committed && - inode->last_unlink_trans <= last_committed) { + inode->generation < trans->transid && + inode->last_unlink_trans < trans->transid) { ret = 0; goto end_trans; } @@ -6125,7 +6137,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * but the file inode does not have a matching BTRFS_INODE_REF_KEY item * and has a link count of 2. 
*/ - if (inode->last_unlink_trans > last_committed) { + if (inode->last_unlink_trans >= trans->transid) { ret = btrfs_log_all_parents(trans, inode, ctx); if (ret) goto end_trans; @@ -6434,7 +6446,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, struct dentry *parent) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_log_ctx ctx; /* @@ -6448,8 +6459,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * if this inode hasn't been logged and directory we're renaming it * from hasn't been logged, we don't need to log it */ - if (inode->logged_trans <= fs_info->last_trans_committed && - (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) + if (inode->logged_trans < trans->transid && + (!old_dir || old_dir->logged_trans < trans->transid)) return; btrfs_init_log_ctx(&ctx, &inode->vfs_inode); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 28525ad7ff8c..74023c8a783f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -129,8 +129,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, } else { btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", - ret, (unsigned long long)key.objectid, - (unsigned long long)key.offset, type); + ret, key.objectid, key.offset, type); goto out; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78637665166e..ee086fc56c30 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -31,6 +31,7 @@ #include "space-info.h" #include "block-group.h" #include "discard.h" +#include "zoned.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -374,6 +375,7 @@ void btrfs_free_device(struct btrfs_device *device) rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); bio_put(device->flush_bio); + btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -667,6 +669,10 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; + ret = btrfs_get_dev_zone_info(device); + if (ret != 0) + goto error_free_page; + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -822,7 +828,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } else { mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL, false); + disk_super->dev_item.uuid, NULL); /* * If this disk has been pulled into an fs devices created by @@ -929,16 +935,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, * make sure it's the same device if the device is mounted */ if (device->bdev) { - struct block_device *path_bdev; + int error; + dev_t path_dev; - path_bdev = lookup_bdev(path); - if (IS_ERR(path_bdev)) { + error = lookup_bdev(path, &path_dev); + if (error) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_CAST(path_bdev); + return ERR_PTR(error); } - if (device->bdev != path_bdev) { - bdput(path_bdev); + if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex); /* * device->fs_info may not be reliable here, so @@ -953,7 +959,6 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, "devid %llu device 
path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), @@ -1044,7 +1049,7 @@ error: } static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, - int step, struct btrfs_device **latest_dev) + struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; @@ -1089,16 +1094,16 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, * After we have read the system tree and know devids belonging to this * filesystem, remove the device which does not belong there. */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *latest_dev = NULL; struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); - __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + __btrfs_free_extra_devids(fs_devices, &latest_dev); list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) - __btrfs_free_extra_devids(seed_dev, step, &latest_dev); + __btrfs_free_extra_devids(seed_dev, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1137,6 +1142,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->bdev = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + btrfs_destroy_dev_zone_info(device); device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); @@ -1217,6 +1223,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; + fs_devices->read_policy = BTRFS_READ_POLICY_PID; return 0; } @@ -1268,7 +1275,7 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) } static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr) + u64 bytenr, u64 bytenr_orig) { struct btrfs_super_block *disk_super; struct page *page; @@ -1299,7 +1306,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev /* align our pointer to the offset of the super block */ disk_super = p + offset_in_page(bytenr); - if (btrfs_super_bytenr(disk_super) != bytenr || + if (btrfs_super_bytenr(disk_super) != bytenr_orig || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); return ERR_PTR(-EINVAL); @@ -1334,7 +1341,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct block_device *bdev; - u64 bytenr; + u64 bytenr, bytenr_orig; + int ret; lockdep_assert_held(&uuid_mutex); @@ -1344,14 +1352,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ - bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) return ERR_CAST(bdev); - disk_super = btrfs_read_disk_super(bdev, bytenr); + bytenr_orig = btrfs_sb_offset(0); + ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + if (ret) + return ERR_PTR(ret); + + disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -2015,6 +2027,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, if (IS_ERR(disk_super)) continue; + if (bdev_is_zoned(bdev)) { + btrfs_reset_sb_log_zones(bdev, copy_num); + continue; + } + memset(&disk_super->magic, 0, 
sizeof(disk_super->magic)); page = virt_to_page(disk_super); @@ -2293,10 +2310,10 @@ static struct btrfs_device *btrfs_find_device_by_path( dev_uuid = disk_super->dev_item.uuid; if (btrfs_fs_incompat(fs_info, METADATA_UUID)) device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid, true); + disk_super->metadata_uuid); else device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid, true); + disk_super->fsid); btrfs_release_disk_super(disk_super); if (!device) @@ -2316,7 +2333,7 @@ struct btrfs_device *btrfs_find_device_by_devspec( if (devid) { device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL, true); + NULL); if (!device) return ERR_PTR(-ENOENT); return device; @@ -2465,7 +2482,7 @@ next_slot: read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2507,6 +2524,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (!btrfs_check_device_zone_type(fs_info, bdev)) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); @@ -2540,10 +2562,17 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_assign_pointer(device->name, name); + device->fs_info = fs_info; + device->bdev = bdev; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + goto error_free_device; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto error_free_device; + goto error_free_zone; } q = bdev_get_queue(bdev); @@ -2556,8 +2585,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; - device->fs_info = fs_info; - device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); device->mode = FMODE_EXCL; @@ -2704,6 +2731,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); +error_free_zone: + btrfs_destroy_dev_zone_info(device); error_free_device: btrfs_free_device(device); error: @@ -5479,7 +5508,18 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; - preferred_mirror = first + current->pid % num_stripes; + switch (fs_info->fs_devices->read_policy) { + default: + /* Shouldn't happen, just warn and use pid instead of failing */ + btrfs_warn_rl(fs_info, + "unknown read_policy type %u, reset to pid", + fs_info->fs_devices->read_policy); + fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; + fallthrough; + case BTRFS_READ_POLICY_PID: + preferred_mirror = first + (current->pid % num_stripes); + break; + } if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == @@ -6335,7 +6375,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, bio->bi_iter.bi_sector = physical >> 9; btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, + bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); bio_set_dev(bio, dev->bdev); @@ 
-6367,7 +6407,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, { struct btrfs_device *dev; struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; int ret; @@ -6447,8 +6487,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, - bool seed) + u64 devid, u8 *uuid, u8 *fsid) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; @@ -6655,7 +6694,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL, true); + devid, uuid, NULL); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -6794,7 +6833,7 @@ static int read_one_dev(struct extent_buffer *leaf, } device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid, true); + fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -6857,6 +6896,16 @@ static int read_one_dev(struct extent_buffer *leaf, } fill_device_from_item(leaf, dev_item, device); + if (device->bdev) { + u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + + if (device->total_bytes > max_total_bytes) { + btrfs_err(fs_info, + "device total_bytes should be at most %llu but found %llu", + max_total_bytes, device->total_bytes); + return -EINVAL; + } + } set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -6891,11 +6940,11 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will * overallocate but we can keep it as-is, only the first page is used. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET); + sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, + root->root_key.objectid, 0); if (IS_ERR(sb)) return PTR_ERR(sb); set_extent_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artificial and just used to read the system array. 
* set_extent_buffer_uptodate() call does not properly mark all its @@ -7059,12 +7108,8 @@ static void readahead_tree_node_children(struct extent_buffer *node) int i; const int nr_items = btrfs_header_nritems(node); - for (i = 0; i < nr_items; i++) { - u64 start; - - start = btrfs_node_blockptr(node, i); - readahead_tree_block(node->fs_info, start); - } + for (i = 0; i < nr_items; i++) + btrfs_readahead_node_child(node, i); } int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) @@ -7451,8 +7496,7 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL, - true); + dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7583,28 +7627,13 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device boundary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); + dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; goto out; } - /* It's possible this device is a dummy for seed device */ - if (dev->disk_total_bytes == 0) { - struct btrfs_fs_devices *devs; - - devs = list_first_entry(&fs_info->fs_devices->seed_list, - struct btrfs_fs_devices, seed_list); - dev = btrfs_find_device(devs, devid, NULL, NULL, false); - if (!dev) { - btrfs_err(fs_info, "failed to find seed devid %llu", - devid); - ret = -EUCLEAN; - goto out; - } - } - if (physical_offset + physical_len > dev->disk_total_bytes) { btrfs_err(fs_info, "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", @@ -7659,6 +7688,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) u64 prev_dev_ext_end = 0; int ret = 0; + /* + * We don't have a dev_root because we mounted with ignorebadroots and + * failed to load the root, so we want to skip the verification in this + * case for sure. + * + * However if the dev root is fine, but the tree itself is corrupted + * we'd still fail to mount. This verification is only to make sure + * writes can happen safely, so instead just bypass this check + * completely in the case of IGNOREBADROOTS. + */ + if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) + return 0; + key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 232f02bd214f..1997a4649a66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -52,6 +52,8 @@ struct btrfs_io_geometry { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +struct btrfs_zoned_device_info; + struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ struct list_head dev_alloc_list; /* chunk mutex */ @@ -65,6 +67,8 @@ struct btrfs_device { struct block_device *bdev; + struct btrfs_zoned_device_info *zone_info; + /* the mode sent to blkdev_get */ fmode_t mode; @@ -211,6 +215,16 @@ enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, }; +/* + * Read policies for mirrored block group profiles; a read picks the stripe based + * on these policies.
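The enum added below feeds the find_live_mirror() switch earlier in this file. A compact sketch of the selection logic with hypothetical userspace names; as in the kernel hunk, an unknown policy value falls back to the PID policy instead of failing:

enum read_policy {
	READ_POLICY_PID, /* use the process PID to choose the stripe */
	NR_READ_POLICY,
};

static int preferred_mirror(int first, int num_stripes, int pid,
			    enum read_policy policy)
{
	switch (policy) {
	default:
		policy = READ_POLICY_PID;
		/* fall through */
	case READ_POLICY_PID:
		/* Different readers hash to different mirrors, spreading load. */
		return first + (pid % num_stripes);
	}
}

With two mirrors, even PIDs end up on one copy and odd PIDs on the other.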
+ */ +enum btrfs_read_policy { + /* Use process PID to choose the stripe */ + BTRFS_READ_POLICY_PID, + BTRFS_NR_READ_POLICY, +}; + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -264,6 +278,9 @@ struct btrfs_fs_devices { struct completion kobj_unregister; enum btrfs_chunk_allocation_policy chunk_alloc_policy; + + /* Policy used to read the mirrored stripes */ + enum btrfs_read_policy read_policy; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -436,7 +453,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, @@ -453,7 +470,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid, bool seed); + u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 95d9aebff2c4..af6246f36a9e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -213,9 +213,11 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, } out: btrfs_free_path(path); - if (!ret) + if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); + } return ret; } @@ -239,7 +241,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); out: btrfs_end_transaction(trans); @@ -390,7 +392,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (!ret) { inode_inc_iversion(inode); inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); BUG_ON(ret); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c new file mode 100644 index 000000000000..c38846659019 --- /dev/null +++ b/fs/btrfs/zoned.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "volumes.h" +#include "zoned.h" +#include "rcu-string.h" + +/* Maximum number of zones to report per blkdev_report_zones() call */ +#define BTRFS_REPORT_NR_ZONES 4096 + +/* Number of superblock log zones */ +#define BTRFS_NR_SB_LOG_ZONES 2 + +static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct blk_zone *zones = data; + + memcpy(&zones[idx], zone, sizeof(*zone)); + + return 0; +} + +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, + u64 *wp_ret) +{ + bool empty[BTRFS_NR_SB_LOG_ZONES]; + bool full[BTRFS_NR_SB_LOG_ZONES]; + sector_t sector; + + ASSERT(zones[0].type != 
BLK_ZONE_TYPE_CONVENTIONAL && + zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL); + + empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY); + empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY); + full[0] = (zones[0].cond == BLK_ZONE_COND_FULL); + full[1] = (zones[1].cond == BLK_ZONE_COND_FULL); + + /* + * Possible states of log buffer zones + * + * Empty[0] In use[0] Full[0] + * Empty[1] * x 0 + * In use[1] 0 x 0 + * Full[1] 1 1 C + * + * Log position: + * *: Special case, no superblock is written + * 0: Use write pointer of zones[0] + * 1: Use write pointer of zones[1] + * C: Compare super blocks from zones[0] and zones[1], use the latest + * one determined by generation + * x: Invalid state + */ + + if (empty[0] && empty[1]) { + /* Special case to distinguish no superblock to read */ + *wp_ret = zones[0].start << SECTOR_SHIFT; + return -ENOENT; + } else if (full[0] && full[1]) { + /* Compare two super blocks */ + struct address_space *mapping = bdev->bd_inode->i_mapping; + struct page *page[BTRFS_NR_SB_LOG_ZONES]; + struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; + int i; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + u64 bytenr; + + bytenr = ((zones[i].start + zones[i].len) + << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; + + page[i] = read_cache_page_gfp(mapping, + bytenr >> PAGE_SHIFT, GFP_NOFS); + if (IS_ERR(page[i])) { + if (i == 1) + btrfs_release_disk_super(super[0]); + return PTR_ERR(page[i]); + } + super[i] = page_address(page[i]); + } + + if (super[0]->generation > super[1]->generation) + sector = zones[1].start; + else + sector = zones[0].start; + + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) + btrfs_release_disk_super(super[i]); + } else if (!full[0] && (empty[1] || full[1])) { + sector = zones[0].wp; + } else if (full[0]) { + sector = zones[1].wp; + } else { + return -EUCLEAN; + } + *wp_ret = sector << SECTOR_SHIFT; + return 0; +} + +/* + * The following zones are reserved as the circular buffer on ZONED btrfs.
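sb_zone_number(), defined just below, maps a superblock mirror to a zone number using the log2 of the zone size in bytes. A worked sketch, assuming the usual btrfs mirror offsets of 64KiB, 64MiB and 256GiB from btrfs_sb_offset():

#include <stdint.h>

/* Assumed superblock mirror offsets on regular devices. */
static const uint64_t sb_offset[] = {
	64ULL << 10,  /* 64KiB  */
	64ULL << 20,  /* 64MiB  */
	256ULL << 30, /* 256GiB */
};

static uint32_t sb_zone_number_example(int shift, int mirror)
{
	uint64_t zone;

	switch (mirror) {
	case 0: return 0;
	case 1: return 16;
	case 2:
		zone = sb_offset[2] >> shift; /* min(offset >> shift, 1024) */
		return zone < 1024 ? zone : 1024;
	}
	return 0;
}

/*
 * With 256MiB zones (shift 28), mirror 2 lands in zone min(1024, 1024) = 1024;
 * with 1GiB zones (shift 30) it lands in zone min(256, 1024) = 256, i.e. the
 * zone starting at 256GiB, matching the comment above.
 */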
+ - The primary superblock: zones 0 and 1 + - The first copy: zones 16 and 17 + - The second copy: zone 1024 or the zone at 256GB, whichever is smaller, and + the following one + */ +static inline u32 sb_zone_number(int shift, int mirror) +{ + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + + switch (mirror) { + case 0: return 0; + case 1: return 16; + case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + } + + return 0; +} + +static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) +{ + int ret; + + if (!*nr_zones) + return 0; + + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, + copy_zone_info_cb, zones); + if (ret < 0) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read zone %llu on %s (devid %llu)", + pos, rcu_str_deref(device->name), + device->devid); + return ret; + } + *nr_zones = ret; + if (!ret) + return -EIO; + + return 0; +} + +int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = NULL; + struct block_device *bdev = device->bdev; + struct request_queue *queue = bdev_get_queue(bdev); + sector_t nr_sectors; + sector_t sector = 0; + struct blk_zone *zones = NULL; + unsigned int i, nreported = 0, nr_zones; + unsigned int zone_sectors; + int ret; + + if (!bdev_is_zoned(bdev)) + return 0; + + if (device->zone_info) + return 0; + + zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); + if (!zone_info) + return -ENOMEM; + + nr_sectors = bdev_nr_sectors(bdev); + zone_sectors = bdev_zone_sectors(bdev); + /* Check if it's a power of 2 (see is_power_of_2) */ + ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); + zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->max_zone_append_size = + (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->seq_zones) { + ret = -ENOMEM; + goto out; + } + + zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->empty_zones) { + ret = -ENOMEM; + goto out; + } + + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + ret = -ENOMEM; + goto out; + } + + /* Get zone types */ + while (sector < nr_sectors) { + nr_zones = BTRFS_REPORT_NR_ZONES; + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, + &nr_zones); + if (ret) + goto out; + + for (i = 0; i < nr_zones; i++) { + if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) + __set_bit(nreported, zone_info->seq_zones); + if (zones[i].cond == BLK_ZONE_COND_EMPTY) + __set_bit(nreported, zone_info->empty_zones); + nreported++; + } + sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; + } + + if (nreported != zone_info->nr_zones) { + btrfs_err_in_rcu(device->fs_info, + "inconsistent number of zones on %s (%u/%u)", + rcu_str_deref(device->name), nreported, + zone_info->nr_zones); + ret = -EIO; + goto out; + } + + /* Validate superblock log */ + nr_zones = BTRFS_NR_SB_LOG_ZONES; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_wp; + int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; + + sb_zone = sb_zone_number(zone_info->zone_size_shift, i); + if (sb_zone + 1 >= zone_info->nr_zones) + continue; + + sector = sb_zone << (zone_info->zone_size_shift
- SECTOR_SHIFT); + ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, + &zone_info->sb_zones[sb_pos], + &nr_zones); + if (ret) + goto out; + + if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { + btrfs_err_in_rcu(device->fs_info, + "zoned: failed to read super block log zone info at devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + + /* + * If zones[0] is conventional, always use the beginning of the + * zone to record the superblock. No need to validate in that case. + */ + if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == + BLK_ZONE_TYPE_CONVENTIONAL) + continue; + + ret = sb_write_pointer(device->bdev, + &zone_info->sb_zones[sb_pos], &sb_wp); + if (ret != -ENOENT && ret) { + btrfs_err_in_rcu(device->fs_info, + "zoned: super block log zone corrupted devid %llu zone %u", + device->devid, sb_zone); + ret = -EUCLEAN; + goto out; + } + } + + + kfree(zones); + + device->zone_info = zone_info; + + /* device->fs_info is not safe to use for printing messages */ + btrfs_info_in_rcu(NULL, + "host-%s zoned block device %s, %u zones of %llu bytes", + bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", + rcu_str_deref(device->name), zone_info->nr_zones, + zone_info->zone_size); + + return 0; + +out: + kfree(zones); + bitmap_free(zone_info->empty_zones); + bitmap_free(zone_info->seq_zones); + kfree(zone_info); + + return ret; +} + +void btrfs_destroy_dev_zone_info(struct btrfs_device *device) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return; + + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); + kfree(zone_info); + device->zone_info = NULL; +} + +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + unsigned int nr_zones = 1; + int ret; + + ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); + if (ret != 0 || !nr_zones) + return ret ?
ret : -EIO; + + return 0; +} + +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 zoned_devices = 0; + u64 nr_devices = 0; + u64 zone_size = 0; + u64 max_zone_append_size = 0; + const bool incompat_zoned = btrfs_is_zoned(fs_info); + int ret = 0; + + /* Count zoned devices */ + list_for_each_entry(device, &fs_devices->devices, dev_list) { + enum blk_zoned_model model; + + if (!device->bdev) + continue; + + model = bdev_zoned_model(device->bdev); + if (model == BLK_ZONED_HM || + (model == BLK_ZONED_HA && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info; + + zone_info = device->zone_info; + zoned_devices++; + if (!zone_size) { + zone_size = zone_info->zone_size; + } else if (zone_info->zone_size != zone_size) { + btrfs_err(fs_info, + "zoned: unequal block device zone sizes: have %llu found %llu", + device->zone_info->zone_size, + zone_size); + ret = -EINVAL; + goto out; + } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; + } + nr_devices++; + } + + if (!zoned_devices && !incompat_zoned) + goto out; + + if (!zoned_devices && incompat_zoned) { + /* No zoned block device found on ZONED filesystem */ + btrfs_err(fs_info, + "zoned: no zoned devices found on a zoned filesystem"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices && !incompat_zoned) { + btrfs_err(fs_info, + "zoned: mode not enabled but zoned device found"); + ret = -EINVAL; + goto out; + } + + if (zoned_devices != nr_devices) { + btrfs_err(fs_info, + "zoned: cannot mix zoned and regular devices"); + ret = -EINVAL; + goto out; + } + + /* + * stripe_size is always aligned to BTRFS_STRIPE_LEN in + * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * check the alignment here. + */ + if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { + btrfs_err(fs_info, + "zoned: zone size %llu not aligned to stripe %u", + zone_size, BTRFS_STRIPE_LEN); + ret = -EINVAL; + goto out; + } + + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + btrfs_err(fs_info, "zoned: mixed block groups not supported"); + ret = -EINVAL; + goto out; + } + + fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = max_zone_append_size; + + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); +out: + return ret; +} + +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + if (!btrfs_is_zoned(info)) + return 0; + + /* + * Space cache writing is not COWed. Disable that to avoid write errors + * in sequential zones. 
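btrfs_check_zoned_mode() above also requires the common zone size to be a multiple of the stripe length, since the allocator wants stripe_len == zone_size. A small sketch of that alignment rule; the 64KiB value for BTRFS_STRIPE_LEN is an assumption based on current kernels:

#include <errno.h>
#include <stdint.h>

#define STRIPE_LEN (64ULL * 1024) /* assumed BTRFS_STRIPE_LEN */

static int check_zone_alignment(uint64_t zone_size)
{
	/* IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN) in kernel terms. */
	if (zone_size & (STRIPE_LEN - 1))
		return -EINVAL;
	return 0; /* e.g. a 256MiB zone is exactly 4096 stripes */
}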
+ */ + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_err(info, "zoned: space cache v1 is not supported"); + return -EINVAL; + } + + if (btrfs_test_opt(info, NODATACOW)) { + btrfs_err(info, "zoned: NODATACOW not supported"); + return -EINVAL; + } + + return 0; +} + +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, + int rw, u64 *bytenr_ret) +{ + u64 wp; + int ret; + + if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { + *bytenr_ret = zones[0].start << SECTOR_SHIFT; + return 0; + } + + ret = sb_write_pointer(bdev, zones, &wp); + if (ret != -ENOENT && ret < 0) + return ret; + + if (rw == WRITE) { + struct blk_zone *reset = NULL; + + if (wp == zones[0].start << SECTOR_SHIFT) + reset = &zones[0]; + else if (wp == zones[1].start << SECTOR_SHIFT) + reset = &zones[1]; + + if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + ASSERT(reset->cond == BLK_ZONE_COND_FULL); + + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + reset->start, reset->len, + GFP_NOFS); + if (ret) + return ret; + + reset->cond = BLK_ZONE_COND_EMPTY; + reset->wp = reset->start; + } + } else if (ret != -ENOENT) { + /* For READ, we want the previous one */ + if (wp == zones[0].start << SECTOR_SHIFT) + wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT; + wp -= BTRFS_SUPER_INFO_SIZE; + } + + *bytenr_ret = wp; + return 0; + +} + +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret) +{ + struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; + unsigned int zone_sectors; + u32 sb_zone; + int ret; + u64 zone_size; + u8 zone_sectors_shift; + sector_t nr_sectors; + u32 nr_zones; + + if (!bdev_is_zoned(bdev)) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + ASSERT(rw == READ || rw == WRITE); + + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) + return -EINVAL; + zone_size = zone_sectors << SECTOR_SHIFT; + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev_nr_sectors(bdev); + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, + BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, + zones); + if (ret < 0) + return ret; + if (ret != BTRFS_NR_SB_LOG_ZONES) + return -EIO; + + return sb_log_location(bdev, zones, rw, bytenr_ret); +} + +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + u32 zone_num; + + if (!zinfo) { + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; + } + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return -ENOENT; + + return sb_log_location(device->bdev, + &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], + rw, bytenr_ret); +} + +static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, + int mirror) +{ + u32 zone_num; + + if (!zinfo) + return false; + + zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); + if (zone_num + 1 >= zinfo->nr_zones) + return false; + + if (!test_bit(zone_num, zinfo->seq_zones)) + return false; + + return true; +} + +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + struct blk_zone *zone; + + if (!is_sb_log_zone(zinfo, mirror)) + return; + + zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; + if (zone->cond != BLK_ZONE_COND_FULL) { + if
(zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; + + return; + } + + zone++; + ASSERT(zone->cond != BLK_ZONE_COND_FULL); + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; +} + +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + sector_t zone_sectors; + sector_t nr_sectors; + u8 zone_sectors_shift; + u32 sb_zone; + u32 nr_zones; + + zone_sectors = bdev_zone_sectors(bdev); + zone_sectors_shift = ilog2(zone_sectors); + nr_sectors = bdev_nr_sectors(bdev); + nr_zones = nr_sectors >> zone_sectors_shift; + + sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); + if (sb_zone + 1 >= nr_zones) + return -ENOENT; + + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sb_zone << zone_sectors_shift, + zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h new file mode 100644 index 000000000000..8abe2f83272b --- /dev/null +++ b/fs/btrfs/zoned.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ZONED_H +#define BTRFS_ZONED_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include "volumes.h" +#include "disk-io.h" + +struct btrfs_zoned_device_info { + /* + * Number of zones, zone size and types of zones if bdev is a + * zoned block device. + */ + u64 zone_size; + u8 zone_size_shift; + u64 max_zone_append_size; + u32 nr_zones; + unsigned long *seq_zones; + unsigned long *empty_zones; + struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; +}; + +#ifdef CONFIG_BLK_DEV_ZONED +int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone); +int btrfs_get_dev_zone_info(struct btrfs_device *device); +void btrfs_destroy_dev_zone_info(struct btrfs_device *device); +int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, + u64 *bytenr_ret); +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, + u64 *bytenr_ret); +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +{ + return 0; +} + +static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) +{ + return 0; +} + +static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { } + +static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_err(fs_info, "zoned block device support is not enabled"); + return -EOPNOTSUPP; +} + +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +{ + return 0; +} + +static inline int btrfs_sb_log_location_bdev(struct block_device *bdev, + int mirror, int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, + int rw, u64 *bytenr_ret) +{ + *bytenr_ret = btrfs_sb_offset(mirror); + return 0; +} + +static inline void btrfs_advance_sb_log(struct btrfs_device *device,
int mirror) +{ } + +static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) +{ + return 0; +} + +#endif + +static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return false; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->seq_zones); +} + +static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + + if (!zone_info) + return true; + + return test_bit(pos >> zone_info->zone_size_shift, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device, + u64 pos, bool set) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno; + + if (!zone_info) + return; + + zno = pos >> zone_info->zone_size_shift; + if (set) + set_bit(zno, zone_info->empty_zones); + else + clear_bit(zno, zone_info->empty_zones); +} + +static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, true); +} + +static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos) +{ + btrfs_dev_set_empty_zone_bit(device, pos, false); +} + +static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, + struct block_device *bdev) +{ + u64 zone_size; + + if (btrfs_is_zoned(fs_info)) { + zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; + /* Do not allow non-zoned device */ + return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + } + + /* Do not allow Host Managed zoned device */ + return bdev_zoned_model(bdev) != BLK_ZONED_HM; +} + +static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) +{ + /* + * On a non-zoned device, any address is OK. On a zoned device, + * non-SEQUENTIAL WRITE REQUIRED zones are capable. + */ + return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); +} + +#endif diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..32647d2011df 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -523,7 +523,7 @@ repeat: void emergency_thaw_bdev(struct super_block *sb) { - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } @@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters.
*/ lock_page_memcg(page); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2d24c765cbd7..2c557229696a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -106,15 +106,25 @@ #endif #ifdef compat_start_thread -#undef start_thread -#define start_thread compat_start_thread +#define COMPAT_START_THREAD(ex, regs, new_ip, new_sp) \ + compat_start_thread(regs, new_ip, new_sp) #endif -#ifdef compat_arch_setup_additional_pages +#ifdef COMPAT_START_THREAD +#undef START_THREAD +#define START_THREAD COMPAT_START_THREAD +#endif + +#ifdef compat_arch_setup_additional_pages +#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ + compat_arch_setup_additional_pages(bprm, interpreter) +#endif + +#ifdef COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #undef ARCH_HAS_SETUP_ADDITIONAL_PAGES #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#undef arch_setup_additional_pages -#define arch_setup_additional_pages compat_arch_setup_additional_pages +#undef ARCH_SETUP_ADDITIONAL_PAGES +#define ARCH_SETUP_ADDITIONAL_PAGES COMPAT_ARCH_SETUP_ADDITIONAL_PAGES #endif #ifdef compat_elf_read_implies_exec diff --git a/fs/coredump.c b/fs/coredump.c index c6acfc694f65..a2f6ecc8e345 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -586,7 +586,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) int ispipe; size_t *argv = NULL; int argc = 0; - struct files_struct *displaced; /* require nonrelative corefile path and be extra careful */ bool need_suid_safe = false; bool core_dumped = false; @@ -792,11 +791,10 @@ void do_coredump(const kernel_siginfo_t *siginfo) } /* get us an unshared descriptor table; almost always a no-op */ - retval = unshare_files(&displaced); + /* The cell spufs coredump code reads the file descriptor tables */ + retval = unshare_files(); if (retval) goto close_fail; - if (displaced) - put_files_struct(displaced); if (!dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..6ca7d16593ff 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,7 +14,7 @@ #include <linux/namei.h> #include <linux/scatterlist.h> #include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" @@ -404,7 +404,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = fscrypt_get_encryption_info(dir); + ret = fscrypt_get_encryption_info(dir, lookup); if (ret) return ret; @@ -560,7 +560,11 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - err = fscrypt_get_encryption_info(d_inode(dir)); + /* + * Pass allow_unsupported=true, so that files with an unsupported + * encryption policy can be deleted. 
+ */ + err = fscrypt_get_encryption_info(d_inode(dir), true); valid = !fscrypt_has_encryption_key(d_inode(dir)); dput(dir); @@ -570,7 +574,3 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 4f5806a3b73d..3fa965eb3336 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -25,6 +25,9 @@ #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 +/* Keep this in sync with include/uapi/linux/fscrypt.h */ +#define FSCRYPT_MODE_MAX FSCRYPT_MODE_ADIANTUM + struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; @@ -294,7 +297,6 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -436,16 +438,9 @@ struct fscrypt_master_key { * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. * - * Locking: protected by key->sem (outer) and mk_secret_sem (inner). - * The reason for two locks is that key->sem also protects modifying - * mk_users, which ranks it above the semaphore for the keyring key - * type, which is in turn above page faults (via keyring_read). But - * sometimes filesystems call fscrypt_get_encryption_info() from within - * a transaction, which ranks it below page faults. So we need a - * separate lock which protects mk_secret but not also mk_users. + * Locking: protected by this master key's key->sem. */ struct fscrypt_master_key_secret mk_secret; - struct rw_semaphore mk_secret_sem; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by @@ -464,8 +459,8 @@ struct fscrypt_master_key { * * This is NULL for v1 policy keys; those can only be added by root. * - * Locking: in addition to this keyrings own semaphore, this is - * protected by the master key's key->sem, so we can do atomic + * Locking: in addition to this keyring's own semaphore, this is + * protected by this master key's key->sem, so we can do atomic * search+insert. It can also be searched without taking any locks, but * in that case the returned key may have already been removed. */ @@ -491,9 +486,9 @@ struct fscrypt_master_key { * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ - struct fscrypt_prepared_key mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; - struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; - struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; + struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; @@ -507,9 +502,9 @@ is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) /* * The READ_ONCE() is only necessary for fscrypt_drop_inode() and * fscrypt_key_describe(). These run in atomic context, so they can't - * take ->mk_secret_sem and thus 'secret' can change concurrently which - * would be a data race. 
But they only need to know whether the secret - * *was* present at the time of check, so READ_ONCE() suffices. + * take the key semaphore and thus 'secret' can change concurrently + * which would be a data race. But they only need to know whether the + * secret *was* present at the time of check, so READ_ONCE() suffices. */ return READ_ONCE(secret->size) != 0; } @@ -575,6 +570,34 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); +int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); + +/** + * fscrypt_require_key() - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. + * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = fscrypt_get_encryption_info(inode, false); + + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} + /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 0cba7928446d..e0ec21055505 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -10,7 +10,7 @@ */ #include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/sha2.h> #include "fscrypt_private.h" diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 20b0df47fe6a..a73b0376e6f3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -54,15 +54,12 @@ EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { - int err; - - err = fscrypt_require_key(dir); - if (err) - return err; - - /* ... in case we looked up no-key name before key was added */ - if (dentry->d_flags & DCACHE_NOKEY_NAME) + if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; + /* + * We don't need to separately check that the directory inode's key is + * available, as it's implied by the dentry not being a no-key name. + */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; @@ -75,19 +72,13 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - int err; - - err = fscrypt_require_key(old_dir); - if (err) - return err; - - err = fscrypt_require_key(new_dir); - if (err) - return err; - - /* ... in case we looked up no-key name(s) before key was added */ - if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME) + if (fscrypt_is_nokey_name(old_dentry) || + fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; + /* + * We don't need to separately check that the directory inodes' keys are + * available, as it's implied by the dentries not being no-key names. 
+ */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && @@ -117,12 +108,25 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); - d_set_d_op(dentry, &fscrypt_d_ops); } return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); +int __fscrypt_prepare_readdir(struct inode *dir) +{ + return fscrypt_get_encryption_info(dir, true); +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); + +int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return fscrypt_require_key(d_inode(dentry)); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); + /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed @@ -138,6 +142,7 @@ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_info *ci; + struct key *key; struct fscrypt_master_key *mk; int err; @@ -153,13 +158,14 @@ int fscrypt_prepare_setflags(struct inode *inode, ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; - mk = ci->ci_master_key->payload.data[0]; - down_read(&mk->mk_secret_sem); + key = ci->ci_master_key; + mk = key->payload.data[0]; + down_read(&key->sem); if (is_master_key_secret_present(&mk->mk_secret)) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; - up_read(&mk->mk_secret_sem); + up_read(&key->sem); return err; } return 0; @@ -325,7 +331,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ - err = fscrypt_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 53cc552a7b8f..0b3ffbb4faf4 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -44,7 +44,7 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); - for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { + for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]); fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]); fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]); @@ -347,7 +347,6 @@ static int add_new_master_key(struct fscrypt_master_key_secret *secret, mk->mk_spec = *mk_spec; move_master_key_secret(&mk->mk_secret, secret); - init_rwsem(&mk->mk_secret_sem); refcount_set(&mk->mk_refcount, 1); /* secret is present */ INIT_LIST_HEAD(&mk->mk_decrypted_inodes); @@ -427,11 +426,8 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, } /* Re-add the secret if needed. */ - if (rekey) { - down_write(&mk->mk_secret_sem); + if (rekey) move_master_key_secret(&mk->mk_secret, secret); - up_write(&mk->mk_secret_sem); - } return 0; } @@ -975,10 +971,8 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) /* No user claims remaining. Go ahead and wipe the secret. 
*/ dead = false; if (is_master_key_secret_present(&mk->mk_secret)) { - down_write(&mk->mk_secret_sem); wipe_master_key_secret(&mk->mk_secret); dead = refcount_dec_and_test(&mk->mk_refcount); - up_write(&mk->mk_secret_sem); } up_write(&key->sem); if (dead) { diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index d595abb8ef90..261293fb7097 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -56,6 +56,8 @@ static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { + BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1); + if (S_ISREG(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; @@ -168,7 +170,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, unsigned int hkdf_infolen = 0; int err; - if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) + if (WARN_ON(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; prep_key = &keys[mode_num]; @@ -335,11 +337,11 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then the - * corresponding 'struct key' is returned in *master_key_ret with - * ->mk_secret_sem read-locked. This is needed to ensure that only one task - * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race - * to create an fscrypt_info for the same inode), and to synchronize the master - * key being removed with a new inode starting to use it. + * corresponding 'struct key' is returned in *master_key_ret with its semaphore + * read-locked. This is needed to ensure that only one task links the + * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create + * an fscrypt_info for the same inode), and to synchronize the master key being + * removed with a new inode starting to use it. */ static int setup_file_encryption_key(struct fscrypt_info *ci, bool need_dirhash_key, @@ -388,7 +390,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, } mk = key->payload.data[0]; - down_read(&mk->mk_secret_sem); + down_read(&key->sem); /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ if (!is_master_key_secret_present(&mk->mk_secret)) { @@ -431,7 +433,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, return 0; out_release_key: - up_read(&mk->mk_secret_sem); + up_read(&key->sem); key_put(key); return err; } @@ -534,9 +536,7 @@ fscrypt_setup_encryption_info(struct inode *inode, res = 0; out: if (master_key) { - struct fscrypt_master_key *mk = master_key->payload.data[0]; - - up_read(&mk->mk_secret_sem); + up_read(&master_key->sem); key_put(master_key); } put_crypt_info(crypt_info); @@ -546,6 +546,11 @@ out: /** * fscrypt_get_encryption_info() - set up an inode's encryption key * @inode: the inode to set up the key for. Must be encrypted. + * @allow_unsupported: if %true, treat an unsupported encryption policy (or + * unrecognized encryption context) the same way as the key + * being unavailable, instead of returning an error. Use + * %false unless the operation being performed is needed in + * order for files (or directories) to be deleted. * * Set up ->i_crypt_info, if it hasn't already been done. * @@ -556,7 +561,7 @@ out: * encryption key is unavailable. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. 
*/ -int fscrypt_get_encryption_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) { int res; union fscrypt_context ctx; @@ -567,29 +572,38 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { + if (res == -ERANGE && allow_unsupported) + return 0; fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } res = fscrypt_policy_from_context(&policy, &ctx, res); if (res) { + if (allow_unsupported) + return 0; fscrypt_warn(inode, "Unrecognized or corrupt encryption context"); return res; } - if (!fscrypt_supported_policy(&policy, inode)) + if (!fscrypt_supported_policy(&policy, inode)) { + if (allow_unsupported) + return 0; return -EINVAL; + } res = fscrypt_setup_encryption_info(inode, &policy, fscrypt_context_nonce(&ctx), IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode)); + + if (res == -ENOPKG && allow_unsupported) /* Algorithm unavailable? */ + res = 0; if (res == -ENOKEY) res = 0; return res; } -EXPORT_SYMBOL(fscrypt_get_encryption_info); /** * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory @@ -710,7 +724,7 @@ int fscrypt_drop_inode(struct inode *inode) return 0; /* - * Note: since we aren't holding ->mk_secret_sem, the result here can + * Note: since we aren't holding the key semaphore, the result here can * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4441d9944b9e..a51cef6bd27f 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -175,7 +175,10 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | + FSCRYPT_POLICY_FLAG_DIRECT_KEY | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", policy->flags); return false; @@ -587,7 +590,7 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { union fscrypt_policy parent_policy, child_policy; - int err; + int err, err1, err2; /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && @@ -617,19 +620,25 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) * In any case, if an unexpected error occurs, fall back to "forbidden". */ - err = fscrypt_get_encryption_info(parent); + err = fscrypt_get_encryption_info(parent, true); if (err) return 0; - err = fscrypt_get_encryption_info(child); + err = fscrypt_get_encryption_info(child, true); if (err) return 0; - err = fscrypt_get_policy(parent, &parent_policy); - if (err) - return 0; + err1 = fscrypt_get_policy(parent, &parent_policy); + err2 = fscrypt_get_policy(child, &child_policy); - err = fscrypt_get_policy(child, &child_policy); - if (err) + /* + * Allow the case where the parent and child both have an unrecognized + * encryption policy, so that files with an unrecognized encryption + * policy can be deleted. 
+ */ + if (err1 == -EINVAL && err2 == -EINVAL) + return 1; + + if (err1 || err2) return 0; return fscrypt_policies_equal(&parent_policy, &child_policy); @@ -810,12 +810,11 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide range to follow_pte_pmd it will - * call mmu_notifier_invalidate_range_start() on our behalf - * before taking any lock. + * Note because we provide range to follow_pte it will call + * mmu_notifier_invalidate_range_start() on our behalf before + * taking any lock. */ - if (follow_pte_pmd(vma->vm_mm, address, &range, - &ptep, &pmdp, &ptl)) + if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl)) continue; /* diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 624617c12250..561dcad08ad6 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS); if (!ls->ls_recover_buf) goto out_lkbidr; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 79f56f16bc2c..372c34ff8594 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -78,9 +78,9 @@ struct connection { #define CF_APP_LIMITED 7 #define CF_CLOSING 8 #define CF_SHUTDOWN 9 +#define CF_CONNECTED 10 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; - int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ int retries; @@ -97,6 +97,11 @@ struct connection { }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) +struct listen_connection { + struct socket *sock; + struct work_struct rwork; +}; + /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; @@ -126,6 +131,7 @@ static struct listen_sock_callbacks { static LIST_HEAD(dlm_node_addrs); static DEFINE_SPINLOCK(dlm_node_addrs_spin); +static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; static int dlm_allow_conn; @@ -141,6 +147,9 @@ DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); +static void sctp_connect_to_sock(struct connection *con); +static void tcp_connect_to_sock(struct connection *con); +static void dlm_tcp_shutdown(struct connection *con); /* This is deliberately very simple because most clusters have simple sequential nodeids, so we should be able to go straight to a connection @@ -169,6 +178,31 @@ static struct connection *__find_con(int nodeid) return NULL; } +static int dlm_con_init(struct connection *con, int nodeid) +{ + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) + return -ENOMEM; + + con->nodeid = nodeid; + mutex_init(&con->sock_mutex); + INIT_LIST_HEAD(&con->writequeue); + spin_lock_init(&con->writequeue_lock); + INIT_WORK(&con->swork, process_send_sockets); + INIT_WORK(&con->rwork, process_recv_sockets); + init_waitqueue_head(&con->shutdown_wait); + + if (dlm_config.ci_protocol == 0) { + con->connect_action = tcp_connect_to_sock; + 
con->shutdown_action = dlm_tcp_shutdown; + } else { + con->connect_action = sctp_connect_to_sock; + } + + return 0; +} + /* * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. @@ -176,7 +210,7 @@ static struct connection *__find_con(int nodeid) static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con, *tmp; - int r; + int r, ret; con = __find_con(nodeid); if (con || !alloc) @@ -186,30 +220,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; - con->rx_buflen = dlm_config.ci_buffer_size; - con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); - if (!con->rx_buf) { + ret = dlm_con_init(con, nodeid); + if (ret) { kfree(con); return NULL; } - con->nodeid = nodeid; - mutex_init(&con->sock_mutex); - INIT_LIST_HEAD(&con->writequeue); - spin_lock_init(&con->writequeue_lock); - INIT_WORK(&con->swork, process_send_sockets); - INIT_WORK(&con->rwork, process_recv_sockets); - init_waitqueue_head(&con->shutdown_wait); - - /* Setup action pointers for child sockets */ - if (con->nodeid) { - struct connection *zerocon = __find_con(0); - - con->connect_action = zerocon->connect_action; - if (!con->rx_action) - con->rx_action = zerocon->rx_action; - } - r = nodeid_hash(nodeid); spin_lock(&connections_lock); @@ -258,7 +274,8 @@ static struct dlm_node_addr *find_node_addr(int nodeid) return NULL; } -static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +static int addr_compare(const struct sockaddr_storage *x, + const struct sockaddr_storage *y) { switch (x->ss_family) { case AF_INET: { @@ -357,10 +374,25 @@ unlock: return rv; } +/* caller needs to hold the dlm_node_addrs_spin lock */ +static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na, + const struct sockaddr_storage *addr) +{ + int i; + + for (i = 0; i < na->addr_count; i++) { + if (addr_compare(na->addr[i], addr)) + return true; + } + + return false; +} + int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) { struct sockaddr_storage *new_addr; struct dlm_node_addr *new_node, *na; + bool ret; new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS); if (!new_node) @@ -385,6 +417,14 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) return 0; } + ret = dlm_lowcomms_na_has_addr(na, addr); + if (ret) { + spin_unlock(&dlm_node_addrs_spin); + kfree(new_addr); + kfree(new_node); + return -EEXIST; + } + if (na->addr_count >= DLM_MAX_ADDR_COUNT) { spin_unlock(&dlm_node_addrs_spin); kfree(new_addr); @@ -410,6 +450,11 @@ static void lowcomms_data_ready(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } +static void lowcomms_listen_data_ready(struct sock *sk) +{ + queue_work(recv_workqueue, &listen_con.rwork); +} + static void lowcomms_write_space(struct sock *sk) { struct connection *con; @@ -419,6 +464,12 @@ static void lowcomms_write_space(struct sock *sk) if (!con) goto out; + if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { + log_print("successfully connected to node %d", con->nodeid); + queue_work(send_workqueue, &con->swork); + goto out; + } + clear_bit(SOCK_NOSPACE, &con->sock->flags); if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) { @@ -539,6 +590,21 @@ static void restore_callbacks(struct socket *sock) write_unlock_bh(&sk->sk_callback_lock); } +static void add_listen_sock(struct socket *sock, struct listen_connection *con) +{ + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + save_listen_callbacks(sock); + con->sock = sock; +
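The CF_CONNECTED hunk above relies on test_and_set_bit() to turn the first write-space callback into a one-shot event: only the caller that observes the bit as previously clear logs the connection and kicks the send workqueue. A minimal userspace sketch of the same one-shot idiom using C11 atomics (con_flags and mark_connected_once() are illustrative names, not part of the patch):

#include <stdatomic.h>
#include <stdio.h>

#define CF_CONNECTED 0x1UL	/* stand-in for the kernel's per-connection flag bit */

static atomic_ulong con_flags;

/* Returns 1 only for the first caller, mirroring !test_and_set_bit(). */
static int mark_connected_once(void)
{
	unsigned long old = atomic_fetch_or(&con_flags, CF_CONNECTED);

	return !(old & CF_CONNECTED);
}

int main(void)
{
	if (mark_connected_once())
		printf("first notification: connected\n");
	if (!mark_connected_once())
		printf("subsequent notifications are ignored\n");
	return 0;
}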
+ sk->sk_user_data = con; + sk->sk_allocation = GFP_NOFS; + /* Install a data_ready callback */ + sk->sk_data_ready = lowcomms_listen_data_ready; + write_unlock_bh(&sk->sk_callback_lock); +} + /* Make a socket active */ static void add_sock(struct socket *sock, struct connection *con) { @@ -576,6 +642,15 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } +static void dlm_close_sock(struct socket **sock) +{ + if (*sock) { + restore_callbacks(*sock); + sock_release(*sock); + *sock = NULL; + } +} + /* Close a remote connection and tidy up */ static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) @@ -592,11 +667,8 @@ static void close_connection(struct connection *con, bool and_other, } mutex_lock(&con->sock_mutex); - if (con->sock) { - restore_callbacks(con->sock); - sock_release(con->sock); - con->sock = NULL; - } + dlm_close_sock(&con->sock); + if (con->othercon && and_other) { /* Will only re-enter once. */ close_connection(con->othercon, false, true, true); @@ -604,6 +676,7 @@ static void close_connection(struct connection *con, bool and_other, con->rx_leftover = 0; con->retries = 0; + clear_bit(CF_CONNECTED, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } @@ -691,11 +764,6 @@ static int receive_from_sock(struct connection *con) goto out_close; } - if (con->nodeid == 0) { - ret = -EINVAL; - goto out_close; - } - /* realloc if we get new buffer size to read out */ buflen = dlm_config.ci_buffer_size; if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { @@ -767,7 +835,7 @@ out_close: } /* Listening socket is busy, accept a connection */ -static int accept_from_sock(struct connection *con) +static int accept_from_sock(struct listen_connection *con) { int result; struct sockaddr_storage peeraddr; @@ -782,12 +850,8 @@ static int accept_from_sock(struct connection *con) return -1; } - mutex_lock_nested(&con->sock_mutex, 0); - - if (!con->sock) { - mutex_unlock(&con->sock_mutex); + if (!con->sock) return -ENOTCONN; - } result = kernel_accept(con->sock, &newsock, O_NONBLOCK); if (result < 0) @@ -809,7 +873,6 @@ static int accept_from_sock(struct connection *con) print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, b, sizeof(struct sockaddr_storage)); sock_release(newsock); - mutex_unlock(&con->sock_mutex); return -1; } @@ -828,7 +891,8 @@ static int accept_from_sock(struct connection *con) result = -ENOMEM; goto accept_err; } - mutex_lock_nested(&newcon->sock_mutex, 1); + + mutex_lock(&newcon->sock_mutex); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -841,38 +905,24 @@ static int accept_from_sock(struct connection *con) goto accept_err; } - othercon->rx_buflen = dlm_config.ci_buffer_size; - othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); - if (!othercon->rx_buf) { - mutex_unlock(&newcon->sock_mutex); + result = dlm_con_init(othercon, nodeid); + if (result < 0) { kfree(othercon); - log_print("failed to allocate incoming socket receive buffer"); - result = -ENOMEM; goto accept_err; } - othercon->nodeid = nodeid; - othercon->rx_action = receive_from_sock; - mutex_init(&othercon->sock_mutex); - INIT_LIST_HEAD(&othercon->writequeue); - spin_lock_init(&othercon->writequeue_lock); - INIT_WORK(&othercon->swork, process_send_sockets); - INIT_WORK(&othercon->rwork, process_recv_sockets); - init_waitqueue_head(&othercon->shutdown_wait); - set_bit(CF_IS_OTHERCON, &othercon->flags); + 
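dlm_close_sock() above funnels every teardown path through one helper that restores the socket callbacks, releases the socket, and NULLs the owner's pointer, so a later call on the same slot is a harmless no-op. A hedged userspace sketch of that close-and-clear shape, using a stdio FILE handle purely for illustration:

#include <stdio.h>

/* Close *fp if it is open and clear the slot, like dlm_close_sock(). */
static void close_file(FILE **fp)
{
	if (*fp) {
		fclose(*fp);
		*fp = NULL;
	}
}

int main(void)
{
	FILE *f = fopen("/dev/null", "w");

	close_file(&f);	/* releases the handle and clears the pointer */
	close_file(&f);	/* second call sees NULL and does nothing */
	return 0;
}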
newcon->othercon = othercon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); } - mutex_lock_nested(&othercon->sock_mutex, 2); - newcon->othercon = othercon; + mutex_lock_nested(&othercon->sock_mutex, 1); add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); } else { - newcon->rx_action = receive_from_sock; /* accept copies the sk after we've saved the callbacks, so we don't want to save them a second time or comm errors will result in calling sk_error_report recursively. */ @@ -889,12 +939,10 @@ static int accept_from_sock(struct connection *con) */ if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) queue_work(recv_workqueue, &addcon->rwork); - mutex_unlock(&con->sock_mutex); return 0; accept_err: - mutex_unlock(&con->sock_mutex); if (newsock) sock_release(newsock); @@ -930,7 +978,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) /* * sctp_bind_addrs - bind a SCTP socket to all our addresses */ -static int sctp_bind_addrs(struct connection *con, uint16_t port) +static int sctp_bind_addrs(struct socket *sock, uint16_t port) { struct sockaddr_storage localaddr; struct sockaddr *addr = (struct sockaddr *)&localaddr; @@ -941,9 +989,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(con->sock, addr, addr_len); + result = kernel_bind(sock, addr, addr_len); else - result = sock_bind_add(con->sock->sk, addr, addr_len); + result = sock_bind_add(sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", @@ -967,11 +1015,6 @@ static void sctp_connect_to_sock(struct connection *con) struct socket *sock; unsigned int mark; - if (con->nodeid == 0) { - log_print("attempt to connect sock 0 foiled"); - return; - } - dlm_comm_mark(con->nodeid, &mark); mutex_lock(&con->sock_mutex); @@ -1000,12 +1043,10 @@ static void sctp_connect_to_sock(struct connection *con) sock_set_mark(sock->sk, mark); - con->rx_action = receive_from_sock; - con->connect_action = sctp_connect_to_sock; add_sock(sock, con); /* Bind to all addresses. */ - if (sctp_bind_addrs(con, 0)) + if (sctp_bind_addrs(con->sock, 0)) goto bind_err; make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); @@ -1027,8 +1068,11 @@ static void sctp_connect_to_sock(struct connection *con) if (result == -EINPROGRESS) result = 0; - if (result == 0) + if (result == 0) { + if (!test_and_set_bit(CF_CONNECTED, &con->flags)) + log_print("successfully connected to node %d", con->nodeid); goto out; + } bind_err: con->sock = NULL; @@ -1065,11 +1109,6 @@ static void tcp_connect_to_sock(struct connection *con) unsigned int mark; int result; - if (con->nodeid == 0) { - log_print("attempt to connect sock 0 foiled"); - return; - } - dlm_comm_mark(con->nodeid, &mark); mutex_lock(&con->sock_mutex); @@ -1095,9 +1134,6 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; } - con->rx_action = receive_from_sock; - con->connect_action = tcp_connect_to_sock; - con->shutdown_action = dlm_tcp_shutdown; add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1153,8 +1189,11 @@ out: return; } -static struct socket *tcp_create_listen_sock(struct connection *con, - struct sockaddr_storage *saddr) +/* On error, the caller must run dlm_close_sock() for the + * listen connection socket.
+ */ +static int tcp_create_listen_sock(struct listen_connection *con, + struct sockaddr_storage *saddr) { struct socket *sock = NULL; int result = 0; @@ -1180,21 +1219,13 @@ static struct socket *tcp_create_listen_sock(struct connection *con, sock_set_reuseaddr(sock->sk); - write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_user_data = con; - save_listen_callbacks(sock); - con->rx_action = accept_from_sock; - con->connect_action = tcp_connect_to_sock; - write_unlock_bh(&sock->sk->sk_callback_lock); + add_listen_sock(sock, con); /* Bind to our port */ make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); if (result < 0) { log_print("Can't bind to port %d", dlm_config.ci_tcp_port); - sock_release(sock); - sock = NULL; - con->sock = NULL; goto create_out; } sock_set_keepalive(sock->sk); @@ -1202,13 +1233,13 @@ static struct socket *tcp_create_listen_sock(struct connection *con, result = sock->ops->listen(sock, 5); if (result < 0) { log_print("Can't listen on port %d", dlm_config.ci_tcp_port); - sock_release(sock); - sock = NULL; goto create_out; } + return 0; + create_out: - return sock; + return result; } /* Get local addresses */ @@ -1237,15 +1268,14 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -/* Initialise SCTP socket and bind to all interfaces */ -static int sctp_listen_for_all(void) +/* Initialise SCTP socket and bind to all interfaces. + * On error, the caller must run dlm_close_sock() for the + * listen connection socket. + */ +static int sctp_listen_for_all(struct listen_connection *con) { struct socket *sock = NULL; int result = -EINVAL; - struct connection *con = nodeid2con(0, GFP_NOFS); - - if (!con) - return -ENOMEM; log_print("Using SCTP for communications"); @@ -1260,47 +1290,29 @@ static int sctp_listen_for_all(void) sock_set_rcvbuf(sock->sk, NEEDED_RMEM); sock_set_mark(sock->sk, dlm_config.ci_mark); sctp_sock_set_nodelay(sock->sk); - write_lock_bh(&sock->sk->sk_callback_lock); - /* Init con struct */ - sock->sk->sk_user_data = con; - save_listen_callbacks(sock); - con->sock = sock; - con->sock->sk->sk_data_ready = lowcomms_data_ready; - con->rx_action = accept_from_sock; - con->connect_action = sctp_connect_to_sock; - - write_unlock_bh(&sock->sk->sk_callback_lock); + add_listen_sock(sock, con); /* Bind to all addresses.
*/ - if (sctp_bind_addrs(con, dlm_config.ci_tcp_port)) - goto create_delsock; + result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port); + if (result < 0) + goto out; result = sock->ops->listen(sock, 5); if (result < 0) { log_print("Can't set socket listening"); - goto create_delsock; + goto out; } return 0; -create_delsock: - sock_release(sock); - con->sock = NULL; out: return result; } static int tcp_listen_for_all(void) { - struct socket *sock = NULL; - struct connection *con = nodeid2con(0, GFP_NOFS); - int result = -EINVAL; - - if (!con) - return -ENOMEM; - /* We don't support multi-homed hosts */ - if (dlm_local_addr[1] != NULL) { + if (dlm_local_count > 1) { log_print("TCP protocol can't handle multi-homed hosts, " "try SCTP"); return -EINVAL; @@ -1308,16 +1320,7 @@ static int tcp_listen_for_all(void) log_print("Using TCP for communications"); - sock = tcp_create_listen_sock(con, dlm_local_addr[0]); - if (sock) { - add_sock(sock, con); - result = 0; - } - else { - result = -EADDRINUSE; - } - - return result; + return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]); } @@ -1352,6 +1355,12 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) struct writequeue_entry *e; int offset = 0; + if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) { + BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN); + log_print("failed to allocate a buffer of size %d", len); + return NULL; + } + con = nodeid2con(nodeid, allocation); if (!con) return NULL; @@ -1506,6 +1515,8 @@ int dlm_lowcomms_close(int nodeid) set_bit(CF_CLOSE, &con->flags); close_connection(con, true, true, true); clean_one_writequeue(con); + if (con->othercon) + clean_one_writequeue(con->othercon); } spin_lock(&dlm_node_addrs_spin); @@ -1529,10 +1540,15 @@ static void process_recv_sockets(struct work_struct *work) clear_bit(CF_READ_PENDING, &con->flags); do { - err = con->rx_action(con); + err = receive_from_sock(con); } while (!err); } +static void process_listen_recv_socket(struct work_struct *work) +{ + accept_from_sock(&listen_con); +} + /* Send workqueue function */ static void process_send_sockets(struct work_struct *work) { @@ -1616,10 +1632,11 @@ static void free_conn(struct connection *con) spin_unlock(&connections_lock); if (con->othercon) { clean_one_writequeue(con->othercon); - call_rcu(&con->othercon->rcu, connection_release); + call_srcu(&connections_srcu, &con->othercon->rcu, + connection_release); } clean_one_writequeue(con); - call_rcu(&con->rcu, connection_release); + call_srcu(&connections_srcu, &con->rcu, connection_release); } static void work_flush(void) @@ -1665,6 +1682,8 @@ void dlm_lowcomms_stop(void) if (send_workqueue) flush_workqueue(send_workqueue); + dlm_close_sock(&listen_con.sock); + foreach_conn(shutdown_conn); work_flush(); foreach_conn(free_conn); @@ -1675,7 +1694,6 @@ void dlm_lowcomms_stop(void) int dlm_lowcomms_start(void) { int error = -EINVAL; - struct connection *con; int i; for (i = 0; i < CONN_HASH_SIZE; i++) @@ -1688,6 +1706,8 @@ int dlm_lowcomms_start(void) goto fail; } + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); + error = work_start(); if (error) goto fail; @@ -1698,7 +1718,7 @@ int dlm_lowcomms_start(void) if (dlm_config.ci_protocol == 0) error = tcp_listen_for_all(); else - error = sctp_listen_for_all(); + error = sctp_listen_for_all(&listen_con); if (error) goto fail_unlisten; @@ -1706,9 +1726,7 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; - con = nodeid2con(0,0); - if (con) - free_conn(con); + 
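The new length check in dlm_lowcomms_get_buffer() above combines a compile-time guarantee that one page can hold the largest message with a runtime rejection of oversized requests. A standalone sketch of that two-level check; PAGE_SIZE is a stand-in here since the real constant is architecture-dependent:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096			/* stand-in; per-architecture in the kernel */
#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096

/* Compile-time analogue of BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN). */
_Static_assert(PAGE_SIZE >= LOWCOMMS_MAX_TX_BUFFER_LEN,
	       "a single page must fit the largest DLM message");

static void *get_buffer(int len)
{
	static char page[PAGE_SIZE];

	if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) {
		fprintf(stderr, "rejected request for %d bytes\n", len);
		return NULL;
	}
	/* a real implementation would carve the buffer out of a queued page */
	return page;
}

int main(void)
{
	assert(get_buffer(LOWCOMMS_MAX_TX_BUFFER_LEN) != NULL);
	assert(get_buffer(LOWCOMMS_MAX_TX_BUFFER_LEN + 1) == NULL);
	return 0;
}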
dlm_close_sock(&listen_con.sock); fail: return error; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 687b2894e469..0918f9376489 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -12,6 +12,8 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ +#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 + int dlm_lowcomms_start(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 7ad83deb4505..ceef3f2074ff 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -270,7 +270,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, log_slots(ls, gen, num, NULL, array, array_size); - max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - + max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); if (num > max_slots) { diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 4daf5dc2b51c..73ddee5159d7 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -162,7 +162,7 @@ retry: set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); send_rcom(ls, mh, rc); @@ -284,7 +284,7 @@ retry: memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); + memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); send_rcom(ls, mh, rc); @@ -304,7 +304,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom); + outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom); error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); if (error) diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 46f2aa4ba46c..af159539fc1b 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,11 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -EROFS_VERSION = "1.0" - -ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" - obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o - diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3d452443c545..aea129ddda74 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -26,30 +26,58 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +/* some special page->private (unsigned long, see below) */ +#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) +#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) + /* - * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - - * used to mark temporary allocated pages from other - * file/cached pages and NULL mapping pages. 
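With the member.c and rcom.c hunks above, recovery message capacity is derived from the fixed LOWCOMMS_MAX_TX_BUFFER_LEN rather than the configurable ci_buffer_size, so max_slots becomes a constant for a given structure layout. A sketch of that arithmetic with stand-in structure sizes (the real sizeof values come from the DLM headers):

#include <stdio.h>

#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096

/* Stand-ins for the DLM wire structures; sizes are assumptions for illustration. */
struct dlm_rcom    { char hdr[64]; };
struct rcom_config { char cfg[16]; };
struct rcom_slot   { char slot[16]; };

int main(void)
{
	unsigned int max_slots =
		(LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) -
		 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);

	/* with these stand-in sizes: (4096 - 64 - 16) / 16 = 251 */
	printf("max_slots = %u\n", max_slots);
	return 0;
}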
+ * For all pages in a pcluster, page->private should be one of + * Type Last 2bits page->private + * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE + * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE + * cached/managed page 00 pointer to z_erofs_pcluster + * online page (file-backed, 01/10/11 sub-index << 2 | count + * some pages can be used for inplace I/O) + * + * page->mapping should be one of + * Type page->mapping + * short-lived page NULL + * preallocated page NULL + * cached/managed page non-NULL or NULL (invalidated/truncated page) + * online page non-NULL + * + * For all managed pages, PG_private should be set with 1 extra refcount, + * which is used for page reclaim / migration. */ -#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) -/* check if a page is marked as staging */ -static inline bool z_erofs_page_is_staging(struct page *page) +/* + * short-lived pages are pages directly from buddy system with specific + * page->private (no need to set PagePrivate since these are non-LRU / + * non-movable pages and bypass reclaim / migration code). + */ +static inline bool z_erofs_is_shortlived_page(struct page *page) { - return page->mapping == Z_EROFS_MAPPING_STAGING; + if (page->private != Z_EROFS_SHORTLIVED_PAGE) + return false; + + DBG_BUGON(page->mapping); + return true; } -static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, - struct page *page) +static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, + struct page *page) { - if (!z_erofs_page_is_staging(page)) + if (!z_erofs_is_shortlived_page(page)) return false; - /* staging pages should not be used by others at the same time */ - if (page_ref_count(page) > 1) + /* short-lived pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) { put_page(page); - else + } else { + /* follow the pcluster rule above. 
*/ + set_page_private(page, 0); list_add(&page->lru, pagepool); + } return true; } diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 347be146884c..ea4f693bee22 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -312,27 +312,12 @@ static void erofs_raw_access_readahead(struct readahead_control *rac) submit_bio(bio); } -static int erofs_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - struct erofs_map_blocks map = { - .m_la = iblock << 9, - }; - int err; - - err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (err) - return err; - - if (map.m_flags & EROFS_MAP_MAPPED) - bh->b_blocknr = erofs_blknr(map.m_pa); - - return err; -} - static sector_t erofs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(block), + }; if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; @@ -341,7 +326,10 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return 0; } - return generic_block_bmap(mapping, block, erofs_get_block); + if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + return erofs_blknr(map.m_pa); + + return 0; } /* for uncompressed (aligned) files and raw access for other files */ diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cbadbf55c6c2..1cb1ffd10569 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -76,7 +76,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = erofs_allocpage(pagepool, GFP_KERNEL); if (!victim) return -ENOMEM; - victim->mapping = Z_EROFS_MAPPING_STAGING; + set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 86fd3bf62af6..6cb356c4217b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -20,6 +20,11 @@ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ + /* + * try to use cached I/O if page allocation succeeds or fall back + * to in-place I/O instead to avoid any direct reclaim.
+ */ + TRYALLOC, }; /* @@ -154,13 +159,16 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type) + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); struct page **pages = clt->compressedpages; pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); bool standalone = true; + gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; @@ -168,6 +176,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, for (; pages < pcl->compressed_pages + clusterpages; ++pages) { struct page *page; compressed_page_t t; + struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(*pages)) @@ -179,7 +188,15 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, t = tag_compressed_page_justfound(page); } else if (type == DELAYEDALLOC) { t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else if (type == TRYALLOC) { + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + goto dontalloc; + + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } else { /* DONTALLOC */ +dontalloc: if (standalone) clt->compressedpages = pages; standalone = false; @@ -189,8 +206,12 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) + if (page) { put_page(page); + } else if (newpage) { + set_page_private(newpage, 0); + list_add(&newpage->lru, pagepool); + } } if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ @@ -226,11 +247,8 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, /* barrier is implied in the following 'unlock_page' */ WRITE_ONCE(pcl->compressed_pages[i], NULL); - set_page_private(page, 0); - ClearPagePrivate(page); - + detach_page_private(page); unlock_page(page); - put_page(page); } return 0; } @@ -254,10 +272,8 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, } erofs_workgroup_unfreeze(&pcl->obj, 1); - if (ret) { - ClearPagePrivate(page); - put_page(page); - } + if (ret) + detach_page_private(page); } return ret; } @@ -297,34 +313,33 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, return ret ? 0 : -EAGAIN; } -static enum z_erofs_collectmode -try_to_claim_pcluster(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t *owned_head) +static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt) { - /* let's claim these following types of pclusters */ -retry: - if (pcl->next == Z_EROFS_PCLUSTER_NIL) { - /* type 1, nil pcluster */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) != Z_EROFS_PCLUSTER_NIL) - goto retry; + struct z_erofs_pcluster *pcl = clt->pcl; + z_erofs_next_pcluster_t *owned_head = &clt->owned_head; + /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; - /* lucky, I am the followee :) */ - return COLLECT_PRIMARY_FOLLOWED; - } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { - /* - * type 2, link to the end of a existing open chain, - * be careful that its submission itself is governed - * by the original owned chain. 
- */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) != Z_EROFS_PCLUSTER_TAIL) - goto retry; + /* so we can attach this pcluster to our submission chain. */ + clt->mode = COLLECT_PRIMARY_FOLLOWED; + return; + } + + /* + * type 2, link to the end of an existing open chain, be careful + * that its submission is controlled by the original attached chain. + */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - return COLLECT_PRIMARY_HOOKED; + clt->mode = COLLECT_PRIMARY_HOOKED; + clt->tailpcl = NULL; + return; } - return COLLECT_PRIMARY; /* :( better luck next time */ + /* type 3, it belongs to a chain, but it isn't the end of the chain */ + clt->mode = COLLECT_PRIMARY; } static int z_erofs_lookup_collection(struct z_erofs_collector *clt, @@ -369,10 +384,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) clt->tailpcl = pcl; - clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); - /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = NULL; + + z_erofs_try_to_claim_pcluster(clt); clt->cl = cl; return 0; } @@ -562,7 +575,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, struct list_head *pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -615,11 +628,12 @@ restart_now: /* preload all compressed pages (maybe downgrade role if necessary) */ if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) - cache_strategy = DELAYEDALLOC; + cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); hitted: /* @@ -648,12 +662,12 @@ hitted: retry: err = z_erofs_attach_page(clt, page, page_type); - /* should allocate an additional staging page for pagevec */ + /* should allocate an additional short-lived page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - newpage->mapping = Z_EROFS_MAPPING_STAGING; + set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); err = z_erofs_attach_page(clt, newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); if (!err) @@ -710,6 +724,11 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, queue_work(z_erofs_workqueue, &io->u.work); } +static bool z_erofs_page_is_invalidated(struct page *page) +{ + return !page->mapping && !z_erofs_is_shortlived_page(page); +} + static void z_erofs_decompressqueue_endio(struct bio *bio) { tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); @@ -722,7 +741,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) struct page *page = bvec->bv_page; DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); if (err) SetPageError(page); @@ -795,9 +814,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* all pages in pagevec ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (z_erofs_put_stagingpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(pagepool, page)) continue; if 
(page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) @@ -831,9 +850,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* all compressed pages ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_page_is_staging(page)) { + if (!z_erofs_is_shortlived_page(page)) { if (erofs_page_is_managed(sbi, page)) { if (!PageUptodate(page)) err = -EIO; @@ -858,7 +877,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = true; } - /* PG_error needs checking for inplaced and staging pages */ + /* PG_error needs checking for all non-managed pages */ if (PageError(page)) { DBG_BUGON(PageUptodate(page)); err = -EIO; @@ -897,8 +916,8 @@ out: if (erofs_page_is_managed(sbi, page)) continue; - /* recycle all individual staging pages */ - (void)z_erofs_put_stagingpage(pagepool, page); + /* recycle all individual short-lived pages */ + (void)z_erofs_put_shortlivedpage(pagepool, page); WRITE_ONCE(compressed_pages[i], NULL); } @@ -908,10 +927,10 @@ out: if (!page) continue; - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - /* recycle all individual staging pages */ - if (z_erofs_put_stagingpage(pagepool, page)) + /* recycle all individual short-lived pages */ + if (z_erofs_put_shortlivedpage(pagepool, page)) continue; if (err < 0) @@ -1008,16 +1027,30 @@ repeat: justfound = tagptr_unfold_tags(t); page = tagptr_unfold_ptr(t); + /* + * Preallocated cache pages are used to avoid direct reclaim; + * otherwise the pcluster takes the inplace I/O path instead. + */ + if (page->private == Z_EROFS_PREALLOCATED_PAGE) { + WRITE_ONCE(pcl->compressed_pages[nr], page); + set_page_private(page, 0); + tocache = true; + goto out_tocache; + } mapping = READ_ONCE(page->mapping); /* - * unmanaged (file) pages are all locked solidly, + * file-backed online pages in the pcluster are all locked steadily, * therefore it is impossible for `mapping' to be NULL.
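Both new page roles ride on page->private sentinels: a page with no mapping can still be told apart as short-lived or preallocated. A rough self-contained sketch of that classification idea (the sentinel values and struct are invented for illustration, not the kernel's definitions):

#define SHORTLIVED_MARK   ((unsigned long)-1)
#define PREALLOCATED_MARK ((unsigned long)-2)

struct fake_page { void *mapping; unsigned long private; };

static int is_shortlived(const struct fake_page *p)
{
	return p->private == SHORTLIVED_MARK;
}

/* "invalidated" = neither file-backed nor a recognized internal page */
static int page_is_invalidated(const struct fake_page *p)
{
	return !p->mapping && !is_shortlived(p);
}

This is the same shape as z_erofs_page_is_invalidated() earlier in the hunk.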
*/ if (mapping && mapping != mc) /* ought to be unmanaged pages */ goto out; + /* directly return for shortlived page as well */ + if (z_erofs_is_shortlived_page(page)) + goto out; + lock_page(page); /* only true if page reclaim goes wrong, should never happen */ @@ -1061,28 +1094,21 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* non-LRU / non-movable temporary page is needed */ - page->mapping = Z_EROFS_MAPPING_STAGING; - tocache = false; - } - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - if (tocache) { - /* since it added to managed cache successfully */ - unlock_page(page); - put_page(page); - } else { - list_add(&page->lru, pagepool); - } + list_add(&page->lru, pagepool); cond_resched(); goto repeat; } - - if (tocache) { - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); +out_tocache: + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* turn into temporary page if fails (1 ref) */ + set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); + goto out; } + attach_page_private(page, pcl); + /* drop a refcount added by allocpage (then we have 2 refs here) */ + put_page(page); + out: /* the only exit (for tracing and debugging) */ return page; } @@ -1284,7 +1310,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, page, &pagepool); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1338,7 +1364,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, page, &pagepool); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 68c9b29fc0ca..b503b353d4ab 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -173,6 +173,7 @@ static inline void z_erofs_onlinepage_endio(struct page *page) v = atomic_dec_return(u.o); if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + set_page_private(page, 0); ClearPagePrivate(page); if (!PageError(page)) SetPageUptodate(page); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4df61129566d..10b81e69db74 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -109,23 +109,22 @@ struct epoll_filefd { int fd; } __packed; -/* - * Structure used to track possible nested calls, for too deep recursions - * and loop cycles. - */ -struct nested_call_node { - struct list_head llink; - void *cookie; - void *ctx; -}; +/* Wait structure used by the poll hooks */ +struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ + struct eppoll_entry *next; -/* - * This structure is used as collector for nested calls, to check for - * maximum recursion dept and loop cycles. - */ -struct nested_calls { - struct list_head tasks_call_list; - spinlock_t lock; + /* The "base" pointer is set to the container "struct epitem" */ + struct epitem *base; + + /* + * Wait queue item that will be linked to the target file wait + * queue head. 
+ */ + wait_queue_entry_t wait; + + /* The wait queue head that linked the "wait" wait queue item */ + wait_queue_head_t *whead; }; /* @@ -154,17 +153,14 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; - /* Number of active wait queue attached to poll operations */ - int nwait; - /* List containing poll wait queues */ - struct list_head pwqlist; + struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ - struct list_head fllink; + struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; @@ -219,6 +215,7 @@ struct eventpoll { /* used to optimize loop detection check */ u64 gen; + struct hlist_head refs; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -231,37 +228,12 @@ struct eventpoll { #endif }; -/* Wait structure used by the poll hooks */ -struct eppoll_entry { - /* List header used to link this structure to the "struct epitem" */ - struct list_head llink; - - /* The "base" pointer is set to the container "struct epitem" */ - struct epitem *base; - - /* - * Wait queue item that will be linked to the target file wait - * queue head. - */ - wait_queue_entry_t wait; - - /* The wait queue head that linked the "wait" wait queue item */ - wait_queue_head_t *whead; -}; - /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; -/* Used by the ep_send_events() function as callback private data */ -struct ep_send_events_data { - int maxevents; - struct epoll_event __user *events; - int res; -}; - /* * Configuration options available inside /proc/sys/fs/epoll/ */ @@ -276,7 +248,7 @@ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ -static struct nested_calls poll_loop_ncalls; +static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -288,7 +260,45 @@ static struct kmem_cache *pwq_cache __read_mostly; * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. 
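The data-structure swaps above share one motive: an hlist_head is a single pointer, so a head can be embedded per tracked object cheaply, and nodes still unlink in O(1) via the node's back-pointer. A small sketch of that trade-off using the kernel hlist API (the struct names here are illustrative, not epoll's):

#include <linux/list.h>

struct watch { struct hlist_node fllink; };
struct holder { struct hlist_head watchers; /* one pointer wide */ };

static void holder_init(struct holder *h)
{
	INIT_HLIST_HEAD(&h->watchers);
}

static void track(struct holder *h, struct watch *w)
{
	hlist_add_head(&w->fllink, &h->watchers);  /* O(1) push */
}

static void untrack(struct watch *w)
{
	hlist_del(&w->fllink);  /* O(1), no head pointer needed */
}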
*/ -static LIST_HEAD(tfile_check_list); +struct epitems_head { + struct hlist_head epitems; + struct epitems_head *next; +}; +static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; + +static struct kmem_cache *ephead_cache __read_mostly; + +static inline void free_ephead(struct epitems_head *head) +{ + if (head) + kmem_cache_free(ephead_cache, head); +} + +static void list_file(struct file *file) +{ + struct epitems_head *head; + + head = container_of(file->f_ep, struct epitems_head, epitems); + if (!head->next) { + head->next = tfile_check_list; + tfile_check_list = head; + } +} + +static void unlist_file(struct epitems_head *head) +{ + struct epitems_head *to_free = head; + struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); + if (p) { + struct epitem *epi= container_of(p, struct epitem, fllink); + spin_lock(&epi->ffd.file->f_lock); + if (!hlist_empty(&head->epitems)) + to_free = NULL; + head->next = NULL; + spin_unlock(&epi->ffd.file->f_lock); + } + free_ephead(to_free); +} #ifdef CONFIG_SYSCTL @@ -351,19 +361,6 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) return container_of(p, struct eppoll_entry, wait)->base; } -/* Get the "struct epitem" from an epoll queue wrapper */ -static inline struct epitem *ep_item_from_epqueue(poll_table *p) -{ - return container_of(p, struct ep_pqueue, pt)->epi; -} - -/* Initialize the poll safe wake up structure */ -static void ep_nested_calls_init(struct nested_calls *ncalls) -{ - INIT_LIST_HEAD(&ncalls->tasks_call_list); - spin_lock_init(&ncalls->lock); -} - /** * ep_events_available - Checks if ready events might be available. * @@ -397,7 +394,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, + BUSY_POLL_BUDGET); } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) @@ -415,12 +413,11 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) unsigned int napi_id; struct socket *sock; struct sock *sk; - int err; if (!net_busy_loop_on()) return; - sock = sock_from_file(epi->ffd.file, &err); + sock = sock_from_file(epi->ffd.file); if (!sock) return; @@ -458,69 +455,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) #endif /* CONFIG_NET_RX_BUSY_POLL */ -/** - * ep_call_nested - Perform a bound (possibly) nested call, by checking - * that the recursion limit is not exceeded, and that - * the same nested call (by the meaning of same cookie) is - * no re-entered. - * - * @ncalls: Pointer to the nested_calls structure to be used for this call. - * @nproc: Nested call core function pointer. - * @priv: Opaque data to be passed to the @nproc callback. - * @cookie: Cookie to be used to identify this nested call. - * @ctx: This instance context. - * - * Returns: Returns the code returned by the @nproc callback, or -1 if - * the maximum recursion limit has been exceeded. - */ -static int ep_call_nested(struct nested_calls *ncalls, - int (*nproc)(void *, void *, int), void *priv, - void *cookie, void *ctx) -{ - int error, call_nests = 0; - unsigned long flags; - struct list_head *lsthead = &ncalls->tasks_call_list; - struct nested_call_node *tncur; - struct nested_call_node tnode; - - spin_lock_irqsave(&ncalls->lock, flags); - - /* - * Try to see if the current task is already inside this wakeup call. 
- * We use a list here, since the population inside this set is always - * very much limited. - */ - list_for_each_entry(tncur, lsthead, llink) { - if (tncur->ctx == ctx && - (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) { - /* - * Ops ... loop detected or maximum nest level reached. - * We abort this wake by breaking the cycle itself. - */ - error = -1; - goto out_unlock; - } - } - - /* Add the current task and cookie to the list */ - tnode.ctx = ctx; - tnode.cookie = cookie; - list_add(&tnode.llink, lsthead); - - spin_unlock_irqrestore(&ncalls->lock, flags); - - /* Call the nested function */ - error = (*nproc)(priv, cookie, call_nests); - - /* Remove the current task from the list */ - spin_lock_irqsave(&ncalls->lock, flags); - list_del(&tnode.llink); -out_unlock: - spin_unlock_irqrestore(&ncalls->lock, flags); - - return error; -} - /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled @@ -617,13 +551,11 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { - struct list_head *lsthead = &epi->pwqlist; + struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; - while (!list_empty(lsthead)) { - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - - list_del(&pwq->llink); + while ((pwq = *p) != NULL) { + *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } @@ -661,38 +593,13 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) rcu_read_unlock(); } -/** - * ep_scan_ready_list - Scans the ready list in a way that makes possible for - * the scan code, to call f_op->poll(). Also allows for - * O(NumReady) performance. - * - * @ep: Pointer to the epoll private data structure. - * @sproc: Pointer to the scan callback. - * @priv: Private opaque data passed to the @sproc callback. - * @depth: The current depth of recursive f_op->poll calls. - * @ep_locked: caller already holds ep->mtx - * - * Returns: The same integer error code returned by the @sproc callback. + +/* + * ep->mutex needs to be held because we could be hit by + * eventpoll_release_file() and epoll_ctl(). */ -static __poll_t ep_scan_ready_list(struct eventpoll *ep, - __poll_t (*sproc)(struct eventpoll *, - struct list_head *, void *), - void *priv, int depth, bool ep_locked) +static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { - __poll_t res; - struct epitem *epi, *nepi; - LIST_HEAD(txlist); - - lockdep_assert_irqs_enabled(); - - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(). - */ - - if (!ep_locked) - mutex_lock_nested(&ep->mtx, depth); - /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events @@ -701,15 +608,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ + lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); - list_splice_init(&ep->rdllist, &txlist); + list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); +} - /* - * Now call the callback function. 
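The rewritten ep_unregister_pollwait() above walks the new single-pointer pwqlist with a pointer-to-pointer cursor, removing nodes without a prev link or a dummy head. The idiom in isolation (plain C, illustrative types):

struct entry { struct entry *next; };

/* pop and release every node; mirrors the while ((pwq = *p)) loop above */
static void drain(struct entry **head, void (*release)(struct entry *))
{
	struct entry *e;

	while ((e = *head) != NULL) {
		*head = e->next;  /* unlink before releasing */
		release(e);
	}
}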
- */ - res = (*sproc)(ep, &txlist, priv); +static void ep_done_scan(struct eventpoll *ep, + struct list_head *txlist) +{ + struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* @@ -744,14 +653,9 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, /* * Quickly re-inject items left on "txlist". */ - list_splice(&txlist, &ep->rdllist); + list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); write_unlock_irq(&ep->lock); - - if (!ep_locked) - mutex_unlock(&ep->mtx); - - return res; } static void epi_rcu_free(struct rcu_head *head) @@ -767,6 +671,8 @@ static void epi_rcu_free(struct rcu_head *head) static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; + struct epitems_head *to_free; + struct hlist_head *head; lockdep_assert_irqs_enabled(); @@ -777,8 +683,20 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); - list_del_rcu(&epi->fllink); + to_free = NULL; + head = file->f_ep; + if (head->first == &epi->fllink && !epi->fllink.next) { + file->f_ep = NULL; + if (!is_file_epoll(file)) { + struct epitems_head *v; + v = container_of(head, struct epitems_head, epitems); + if (!smp_load_acquire(&v->next)) + to_free = v; + } + } + hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); + free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -864,48 +782,31 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv); -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); +static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); -/* - * Differs from ep_eventpoll_poll() in that internal callers already have - * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() - * is correctly annotated. - */ -static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, - int depth) -{ - struct eventpoll *ep; - bool locked; - - pt->_key = epi->event.events; - if (!is_file_epoll(epi->ffd.file)) - return vfs_poll(epi->ffd.file, pt) & epi->event.events; - - ep = epi->ffd.file->private_data; - poll_wait(epi->ffd.file, &ep->poll_wait, pt); - locked = pt && (pt->_qproc == ep_ptable_queue_proc); - - return ep_scan_ready_list(epi->ffd.file->private_data, - ep_read_events_proc, &depth, depth, - locked) & epi->event.events; -} - -static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv) +static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { + struct eventpoll *ep = file->private_data; + LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; - int depth = *(int *)priv; + __poll_t res = 0; init_poll_funcptr(&pt, NULL); - depth++; - list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt, depth)) { - return EPOLLIN | EPOLLRDNORM; + /* Insert inside our poll wait queue */ + poll_wait(file, &ep->poll_wait, wait); + + /* + * Proceed to find out if wanted events are really available inside + * the ready list. 
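ep_start_scan()/ep_done_scan() split the old ep_scan_ready_list() into a steal/process/reinject sequence: grab the whole ready list under the lock, work on it privately, then put leftovers back. The shape of it as a self-contained sketch over a singly linked queue (the real code splices list_head lists under ep->lock; locking is elided here):

struct node { struct node *next; };
struct readyq { struct node *head; };

/* ep_start_scan analogue: take the entire ready list in O(1) */
static struct node *steal_all(struct readyq *q)
{
	struct node *stolen = q->head;

	q->head = NULL;  /* producers now append to a fresh list */
	return stolen;
}

/* ep_done_scan analogue: unprocessed leftovers go back in front */
static void reinject(struct readyq *q, struct node *leftovers)
{
	struct node **tail = &leftovers;

	if (!leftovers)
		return;
	while (*tail)
		tail = &(*tail)->next;
	*tail = q->head;
	q->head = leftovers;
}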
+ */ + mutex_lock_nested(&ep->mtx, depth); + ep_start_scan(ep, &txlist); + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { + if (ep_item_poll(epi, &pt, depth + 1)) { + res = EPOLLIN | EPOLLRDNORM; + break; } else { /* * Item has been dropped into the ready list by the poll @@ -916,24 +817,33 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head list_del_init(&epi->rdllink); } } - - return 0; + ep_done_scan(ep, &txlist); + mutex_unlock(&ep->mtx); + return res; } -static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, + int depth) { - struct eventpoll *ep = file->private_data; - int depth = 0; + struct file *file = epi->ffd.file; + __poll_t res; - /* Insert inside our poll wait queue */ - poll_wait(file, &ep->poll_wait, wait); + pt->_key = epi->event.events; + if (!is_file_epoll(file)) + res = vfs_poll(file, pt); + else + res = __ep_eventpoll_poll(file, pt, depth); + return res & epi->event.events; +} - /* - * Proceed to find out if wanted events are really available inside - * the ready list. - */ - return ep_scan_ready_list(ep, ep_read_events_proc, - &depth, depth, false); +static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) +{ + return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS @@ -978,7 +888,8 @@ static const struct file_operations eventpoll_fops = { void eventpoll_release_file(struct file *file) { struct eventpoll *ep; - struct epitem *epi, *next; + struct epitem *epi; + struct hlist_node *next; /* * We don't want to get "file->f_lock" because it is not @@ -994,7 +905,11 @@ void eventpoll_release_file(struct file *file) * Besides, ep_remove() acquires the lock, so we can't hold it here. 
*/ mutex_lock(&epmutex); - list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) { + if (unlikely(!file->f_ep)) { + mutex_unlock(&epmutex); + return; + } + hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); @@ -1309,23 +1224,28 @@ out_unlock: static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { - struct epitem *epi = ep_item_from_epqueue(pt); + struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); + struct epitem *epi = epq->epi; struct eppoll_entry *pwq; - if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { - init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); - pwq->whead = whead; - pwq->base = epi; - if (epi->event.events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(whead, &pwq->wait); - else - add_wait_queue(whead, &pwq->wait); - list_add_tail(&pwq->llink, &epi->pwqlist); - epi->nwait++; - } else { - /* We have to signal that an error occurred */ - epi->nwait = -1; + if (unlikely(!epi)) // an earlier allocation has failed + return; + + pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); + if (unlikely(!pwq)) { + epq->epi = NULL; + return; } + + init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); + pwq->whead = whead; + pwq->base = epi; + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); + pwq->next = epi->pwqlist; + epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) @@ -1385,42 +1305,29 @@ static void path_count_init(void) path_count[i] = 0; } -static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; - struct file *file = priv; - struct file *child_file; struct epitem *epi; + if (depth > EP_MAX_NESTS) /* too deep nesting */ + return -1; + /* CTL_DEL can remove links here, but that can't increase our count */ - rcu_read_lock(); - list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { - child_file = epi->ep->file; - if (is_file_epoll(child_file)) { - if (list_empty(&child_file->f_ep_links)) { - if (path_count_inc(call_nests)) { - error = -1; - break; - } - } else { - error = ep_call_nested(&poll_loop_ncalls, - reverse_path_check_proc, - child_file, child_file, - current); - } - if (error != 0) - break; - } else { - printk(KERN_ERR "reverse_path_check_proc: " - "file is not an ep!\n"); - } + hlist_for_each_entry_rcu(epi, refs, fllink) { + struct hlist_head *refs = &epi->ep->refs; + if (hlist_empty(refs)) + error = path_count_inc(depth); + else + error = reverse_path_check_proc(refs, depth + 1); + if (error != 0) + break; } - rcu_read_unlock(); return error; } /** - * reverse_path_check - The tfile_check_list is list of file *, which have + * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. 
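reverse_path_check_proc() now carries its recursion depth as a plain argument instead of routing through the removed ep_call_nested() trampoline, so the nesting bound becomes a one-line check. Schematically (illustrative tree types; the cap mirrors EP_MAX_NESTS, whose value is assumed here):

#define MAX_NESTS 4  /* assumed for the sketch */

struct tnode { int nkids; struct tnode **kids; };

static int walk(const struct tnode *n, int depth)
{
	int i;

	if (depth > MAX_NESTS)
		return -1;  /* refuse overly deep nesting */
	for (i = 0; i < n->nkids; i++) {
		int err = walk(n->kids[i], depth + 1);

		if (err)
			return err;
	}
	return 0;
}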
We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up @@ -1431,19 +1338,18 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) */ static int reverse_path_check(void) { - int error = 0; - struct file *current_file; + struct epitems_head *p; - /* let's call this for all tfiles */ - list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { + int error; path_count_init(); - error = ep_call_nested(&poll_loop_ncalls, - reverse_path_check_proc, current_file, - current_file, current); + rcu_read_lock(); + error = reverse_path_check_proc(&p->epitems, 0); + rcu_read_unlock(); if (error) - break; + return error; } - return error; + return 0; } static int ep_create_wakeup_source(struct epitem *epi) @@ -1484,6 +1390,39 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi) wakeup_source_unregister(ws); } +static int attach_epitem(struct file *file, struct epitem *epi) +{ + struct epitems_head *to_free = NULL; + struct hlist_head *head = NULL; + struct eventpoll *ep = NULL; + + if (is_file_epoll(file)) + ep = file->private_data; + + if (ep) { + head = &ep->refs; + } else if (!READ_ONCE(file->f_ep)) { +allocate: + to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); + if (!to_free) + return -ENOMEM; + head = &to_free->epitems; + } + spin_lock(&file->f_lock); + if (!file->f_ep) { + if (unlikely(!head)) { + spin_unlock(&file->f_lock); + goto allocate; + } + file->f_ep = head; + to_free = NULL; + } + hlist_add_head_rcu(&epi->fllink, file->f_ep); + spin_unlock(&file->f_lock); + free_ephead(to_free); + return 0; +} + /* * Must be called with "mtx" held. */ @@ -1495,47 +1434,62 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, long user_watches; struct epitem *epi; struct ep_pqueue epq; + struct eventpoll *tep = NULL; + + if (is_file_epoll(tfile)) + tep = tfile->private_data; lockdep_assert_irqs_enabled(); user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; - if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); - INIT_LIST_HEAD(&epi->fllink); - INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; - epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; - if (epi->event.events & EPOLLWAKEUP) { - error = ep_create_wakeup_source(epi); - if (error) - goto error_create_wakeup_source; - } else { - RCU_INIT_POINTER(epi->ws, NULL); - } + if (tep) + mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); + if (unlikely(attach_epitem(tfile, epi) < 0)) { + kmem_cache_free(epi_cache, epi); + if (tep) + mutex_unlock(&tep->mtx); + return -ENOMEM; + } + + if (full_check && !tep) + list_file(tfile); + + atomic_long_inc(&ep->user->epoll_watches); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. 
*/ ep_rbtree_insert(ep, epi); + if (tep) + mutex_unlock(&tep->mtx); /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; + if (unlikely(full_check && reverse_path_check())) { + ep_remove(ep, epi); + return -EINVAL; + } + + if (epi->event.events & EPOLLWAKEUP) { + error = ep_create_wakeup_source(epi); + if (error) { + ep_remove(ep, epi); + return error; + } + } /* Initialize the poll table using the queue callback */ epq.epi = epi; @@ -1555,9 +1509,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ - error = -ENOMEM; - if (epi->nwait < 0) - goto error_unregister; + if (unlikely(!epq.epi)) { + ep_remove(ep, epi); + return -ENOMEM; + } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); @@ -1579,40 +1534,11 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, write_unlock_irq(&ep->lock); - atomic_long_inc(&ep->user->epoll_watches); - /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL); return 0; - -error_unregister: - ep_unregister_pollwait(ep, epi); -error_remove_epi: - spin_lock(&tfile->f_lock); - list_del_rcu(&epi->fllink); - spin_unlock(&tfile->f_lock); - - rb_erase_cached(&epi->rbn, &ep->rbr); - - /* - * We need to do this because an event could have been arrived on some - * allocated wait queue. Note that we don't care about the ep->ovflist - * list, since that is used/cleaned only inside a section bound by "mtx". - * And ep_insert() is called with "mtx" held. - */ - write_lock_irq(&ep->lock); - if (ep_is_linked(epi)) - list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); - - wakeup_source_unregister(ep_wakeup_source(epi)); - -error_create_wakeup_source: - kmem_cache_free(epi_cache, epi); - - return error; } /* @@ -1691,28 +1617,28 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, return 0; } -static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head, - void *priv) +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) { - struct ep_send_events_data *esed = priv; - __poll_t revents; struct epitem *epi, *tmp; - struct epoll_event __user *uevent = esed->events; - struct wakeup_source *ws; + LIST_HEAD(txlist); poll_table pt; + int res = 0; init_poll_funcptr(&pt, NULL); - esed->res = 0; + + mutex_lock(&ep->mtx); + ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. - * Items cannot vanish during the loop because ep_scan_ready_list() is - * holding "mtx" during this call. + * Items cannot vanish during the loop because we are holding ep->mtx. */ - lockdep_assert_held(&ep->mtx); + list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { + struct wakeup_source *ws; + __poll_t revents; - list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (esed->res >= esed->maxevents) + if (res >= maxevents) break; /* @@ -1735,24 +1661,23 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head /* * If the event mask intersect the caller-requested one, - * deliver the event to userspace. Again, ep_scan_ready_list() - * is holding ep->mtx, so no operations coming from userspace - * can change the item. + * deliver the event to userspace. Again, we are holding ep->mtx, + * so no operations coming from userspace can change the item.
*/ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; - if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) { - list_add(&epi->rdllink, head); + if (__put_user(revents, &events->events) || + __put_user(epi->event.data, &events->data)) { + list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); - if (!esed->res) - esed->res = -EFAULT; - return 0; + if (!res) + res = -EFAULT; + break; } - esed->res++; - uevent++; + res++; + events++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { @@ -1771,20 +1696,10 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head ep_pm_stay_awake(epi); } } + ep_done_scan(ep, &txlist); + mutex_unlock(&ep->mtx); - return 0; -} - -static int ep_send_events(struct eventpoll *ep, - struct epoll_event __user *events, int maxevents) -{ - struct ep_send_events_data esed; - - esed.maxevents = maxevents; - esed.events = events; - - ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); - return esed.res; + return res; } static inline struct timespec64 ep_set_mstimeout(long ms) @@ -1946,40 +1861,36 @@ send_events: } /** - * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() - * API, to verify that adding an epoll file inside another + * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure, does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @priv: Pointer to the epoll file to be currently checked. - * @cookie: Original cookie for this call. This is the top-of-the-chain epoll - * data structure pointer. - * @call_nests: Current dept of the @ep_call_nested() call stack. + * @depth: Current depth of the path being checked. * * Returns: Returns zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or -1 otherwise. */ -static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; - struct file *file = priv; - struct eventpoll *ep = file->private_data; - struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; - mutex_lock_nested(&ep->mtx, call_nests + 1); + mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; - error = ep_call_nested(&poll_loop_ncalls, - ep_loop_check_proc, epi->ffd.file, - ep_tovisit, current); + if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) + error = -1; + else + error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { @@ -1991,11 +1902,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) * not already there, and calling reverse_path_check() * during ep_insert(). 
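ep_loop_check_proc() above stamps every visited eventpoll with the current loop_check_gen; bumping the counter before a scan invalidates all old marks at once, so no clearing pass is needed. The idiom on its own (illustrative types, single-threaded for brevity):

static unsigned long long scan_gen;  /* bumped once per full check */

struct gnode { unsigned long long gen; int nkids; struct gnode **kids; };

static int visit(struct gnode *n, int depth)
{
	int i;

	if (depth > 4)
		return -1;
	n->gen = scan_gen;  /* mark as seen in this pass */
	for (i = 0; i < n->nkids; i++) {
		if (n->kids[i]->gen == scan_gen)
			continue;  /* already checked during this scan */
		if (visit(n->kids[i], depth + 1))
			return -1;
	}
	return 0;
}

A new check is then just scan_gen++ followed by visit(root, 0).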
*/ - if (list_empty(&epi->ffd.file->f_tfile_llink)) { - if (get_file_rcu(epi->ffd.file)) - list_add(&epi->ffd.file->f_tfile_llink, - &tfile_check_list); - } + list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); @@ -2004,34 +1911,31 @@ } /** - * ep_loop_check - Performs a check to verify that adding an epoll file (@file) - * another epoll file (represented by @ep) does not create + * ep_loop_check - Performs a check to verify that adding an epoll file (@to) + * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * - * @ep: Pointer to the epoll private data structure. - * @file: Pointer to the epoll file to be checked. + * @ep: Pointer to the epoll we are inserting into. + * @to: Pointer to the epoll to be inserted. * - * Returns: Returns zero if adding the epoll @file inside current epoll - * structure @ep does not violate the constraints, or -1 otherwise. + * Returns: Returns zero if adding the epoll @to inside the epoll @ep + * does not violate the constraints, or -1 otherwise. */ -static int ep_loop_check(struct eventpoll *ep, struct file *file) +static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { - return ep_call_nested(&poll_loop_ncalls, - ep_loop_check_proc, file, ep, current); + inserting_into = ep; + return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { - struct file *file; - - /* first clear the tfile_check_list */ - while (!list_empty(&tfile_check_list)) { - file = list_first_entry(&tfile_check_list, struct file, - f_tfile_llink); - list_del_init(&file->f_tfile_llink); - fput(file); + rcu_read_lock(); + while (tfile_check_list != EP_UNACTIVE_PTR) { + struct epitems_head *head = tfile_check_list; + tfile_check_list = head->next; + unlist_file(head); } - INIT_LIST_HEAD(&tfile_check_list); + rcu_read_unlock(); } /* @@ -2181,9 +2085,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { - if (!list_empty(&f.file->f_ep_links) || - ep->gen == loop_check_gen || - is_file_epoll(tf.file)) { + if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || + is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) @@ -2191,25 +2094,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { + tep = tf.file->private_data; error = -ELOOP; - if (ep_loop_check(ep, tf.file) != 0) + if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; - } else { - get_file(tf.file); - list_add(&tf.file->f_tfile_llink, - &tfile_check_list); } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; - if (is_file_epoll(tf.file)) { - tep = tf.file->private_data; - error = epoll_mutex_lock(&tep->mtx, 1, nonblock); - if (error) { - mutex_unlock(&ep->mtx); - goto error_tgt_fput; - } - } } } @@ -2245,8 +2137,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -ENOENT; break; } - if (tep != NULL) - mutex_unlock(&tep->mtx); mutex_unlock(&ep->mtx); error_tgt_fput: @@ -2394,12 +2284,6 @@ static int __init eventpoll_init(void) BUG_ON(max_user_watches < 0); /* - * Initialize the structure used to perform epoll file descriptor * inclusion loops checks.
- */ - ep_nested_calls_init(&poll_loop_ncalls); - - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ @@ -2413,6 +2297,9 @@ static int __init eventpoll_init(void) pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + ephead_cache = kmem_cache_create("ep_head", + sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + return 0; } fs_initcall(eventpoll_init); diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..5d4d52039105 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> +#include <linux/syscall_user_dispatch.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -756,8 +757,8 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; - if (stack_base > STACK_SIZE_MAX) - stack_base = STACK_SIZE_MAX; + + stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); @@ -965,8 +966,8 @@ EXPORT_SYMBOL(read_code); /* * Maps the mm_struct mm into the current task struct. - * On success, this function returns with the mutex - * exec_update_mutex locked. + * On success, this function returns with exec_update_lock + * held for writing. */ static int exec_mmap(struct mm_struct *mm) { @@ -981,7 +982,7 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) sync_mm_rss(old_mm); - ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; @@ -995,7 +996,7 @@ static int exec_mmap(struct mm_struct *mm) mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); - mutex_unlock(&tsk->signal->exec_update_mutex); + up_write(&tsk->signal->exec_update_lock); return -EINTR; } } @@ -1258,6 +1259,16 @@ int begin_new_exec(struct linux_binprm * bprm) goto out; /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + + /* Ensure the files table is not shared. */ + retval = unshare_files(); + if (retval) + goto out; + + /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update * to be lockless. 
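The exec_update_mutex to exec_update_lock change above is a mutex-to-rwsem conversion: execve still takes the lock exclusively, but readers such as procfs can now overlap one another. The usage pattern with the kernel rwsem API (a sketch; the containing struct name is invented and error paths are trimmed):

#include <linux/errno.h>
#include <linux/rwsem.h>

struct sig_like { struct rw_semaphore exec_update_lock; };

static int writer_side(struct sig_like *sig)
{
	if (down_write_killable(&sig->exec_update_lock))
		return -EINTR;  /* interrupted by a fatal signal */
	/* ... install the new mm, creds, ... */
	up_write(&sig->exec_update_lock);
	return 0;
}

static void reader_side(struct sig_like *sig)
{
	down_read(&sig->exec_update_lock);
	/* ... observe a consistent post-exec state ... */
	up_read(&sig->exec_update_lock);
}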
@@ -1302,6 +1313,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace @@ -1382,7 +1395,7 @@ int begin_new_exec(struct linux_binprm * bprm) return 0; out_unlock: - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); out: return retval; } @@ -1423,7 +1436,7 @@ void setup_new_exec(struct linux_binprm * bprm) * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); @@ -1776,21 +1789,11 @@ static int bprm_execve(struct linux_binprm *bprm, int fd, struct filename *filename, int flags) { struct file *file; - struct files_struct *displaced; int retval; - /* - * Cancel any io_uring activity across execve - */ - io_uring_task_cancel(); - - retval = unshare_files(&displaced); - if (retval) - return retval; - retval = prepare_bprm_creds(bprm); if (retval) - goto out_files; + return retval; check_unsafe_exec(bprm); current->in_execve = 1; @@ -1805,11 +1808,14 @@ static int bprm_execve(struct linux_binprm *bprm, bprm->file = file; /* * Record that a name derived from an O_CLOEXEC fd will be - * inaccessible after exec. Relies on having exclusive access to - * current->files (due to unshare_files above). + * inaccessible after exec. This allows the code in exec to + * choose to fail when the executable is not mmaped into the + * interpreter and an open file descriptor is not passed to + * the interpreter. This makes for a better user experience + * than having the interpreter start and then immediately fail + * when it finds the executable is inaccessible. 
*/ - if (bprm->fdpath && - close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + if (bprm->fdpath && get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; /* Set the unchanging part of bprm->cred */ @@ -1827,8 +1833,6 @@ static int bprm_execve(struct linux_binprm *bprm, rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); - if (displaced) - put_files_struct(displaced); return retval; out: @@ -1845,10 +1849,6 @@ out_unmark: current->fs->in_exec = 0; current->in_execve = 0; -out_files: - if (displaced) - reset_files_struct(displaced); - return retval; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 2dd55b172d57..0106eba46d5a 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -417,9 +417,11 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, } EXPORT_SYMBOL_GPL(exportfs_encode_fh); -struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, - int fh_len, int fileid_type, - int (*acceptable)(void *, struct dentry *), void *context) +struct dentry * +exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, + int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) { const struct export_operations *nop = mnt->mnt_sb->s_export_op; struct dentry *result, *alias; @@ -432,10 +434,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (PTR_ERR(result) == -ENOMEM) - return ERR_CAST(result); if (IS_ERR_OR_NULL(result)) - return ERR_PTR(-ESTALE); + return result; /* * If no acceptance criteria was specified by caller, a disconnected @@ -561,10 +561,26 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); - if (err != -ENOMEM) - err = -ESTALE; return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw); + +struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, + int fh_len, int fileid_type, + int (*acceptable)(void *, struct dentry *), + void *context) +{ + struct dentry *ret; + + ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, + acceptable, context); + if (IS_ERR_OR_NULL(ret)) { + if (ret == ERR_PTR(-ENOMEM)) + return ret; + return ERR_PTR(-ESTALE); + } + return ret; +} EXPORT_SYMBOL_GPL(exportfs_decode_fh); MODULE_LICENSE("GPL"); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 70355ab6740e..14aa45316ad2 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -66,12 +66,6 @@ static inline unsigned ext2_chunk_size(struct inode *inode) return inode->i_sb->s_blocksize; } -static inline void ext2_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. @@ -336,6 +330,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) * returns the page in which the entry was found (as a parameter - res_page), * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. + * + * On Success ext2_put_page() should be called on *res_page. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, const struct qstr *child, struct page **res_page) @@ -401,6 +397,12 @@ found: return de; } +/** + * Return the '..' directory entry and the page in which the entry was found + * (as a parameter - p). + * + * On Success ext2_put_page() should be called on *p. 
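The kernel-doc added above pins down the calling convention for ext2 directory lookups: on success the caller holds a mapped page and must drop it with ext2_put_page() exactly once. A hypothetical caller in that shape (condensed error handling; the helper name is made up):

/* illustrative; mirrors how ext2 resolves a name to an inode number */
static ino_t lookup_ino(struct inode *dir, const struct qstr *name)
{
	struct page *page;
	struct ext2_dir_entry_2 *de;
	ino_t ino = 0;

	de = ext2_find_entry(dir, name, &page);
	if (de) {
		ino = le32_to_cpu(de->inode);  /* read while still mapped */
		ext2_put_page(page);           /* kunmap + put_page, once */
	}
	return ino;
}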
+ */ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) { struct page *page = ext2_get_page(dir, 0, 0); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 5136b7289e8d..2a4175fbaf5e 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -16,6 +16,8 @@ #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/rbtree.h> +#include <linux/mm.h> +#include <linux/highmem.h> /* XXX Here for now... not interested in restructing headers JUST now */ @@ -745,6 +747,11 @@ extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); +static inline void ext2_put_page(struct page *page) +{ + kunmap(page); + put_page(page); +} /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 11c5c6fe75bb..78c417d3c898 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1256,6 +1256,7 @@ do_indirects: mark_inode_dirty(inode); ext2_free_branches(inode, &nr, &nr+1, 3); } + break; case EXT2_TIND_BLOCK: ; } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5bf2c145643b..ea980f1e2e99 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -389,23 +389,18 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (dir_de) { if (old_dir != new_dir) ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); - else { - kunmap(dir_page); - put_page(dir_page); - } + else + ext2_put_page(dir_page); inode_dec_link_count(old_dir); } return 0; out_dir: - if (dir_de) { - kunmap(dir_page); - put_page(dir_page); - } + if (dir_de) + ext2_put_page(dir_page); out_old: - kunmap(old_page); - put_page(old_page); + ext2_put_page(old_page); out: return err; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 09f1fe676972..6c4753277916 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1070,7 +1070,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc_array (db_count, + sbi->s_group_desc = kmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ca50c90adc4c..5ed870614c8d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -118,11 +118,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); - if (IS_ENCRYPTED(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - } + err = fscrypt_prepare_readdir(inode); + if (err) + return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); @@ -616,13 +614,6 @@ finished: return 0; } -static int ext4_dir_open(struct inode * inode, struct file * filp) -{ - if (IS_ENCRYPTED(inode)) - return fscrypt_get_encryption_info(inode) ? 
-EACCES : 0; - return 0; -} - static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -664,13 +655,5 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, - .open = ext4_dir_open, .release = ext4_release_dir, }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations ext4_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ecaf96d0a4..c64ea8f59ea7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3381,10 +3381,6 @@ static inline void ext4_unlock_group(struct super_block *sb, /* dir.c */ extern const struct file_operations ext4_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations ext4_dentry_ops; -#endif - /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index d62d802c9c12..7935ea6cf92c 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -80,6 +80,145 @@ struct timestamp_expectation { bool lower_bound; }; +static const struct timestamp_expectation test_data[] = { + { + .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 0, + .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = -1LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0, + .expected = {0LL, 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 0, + .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_LO_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 1, + .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 1, + .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NEG_HI_1_CASE, + .msb_set = true, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 2, + .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 2, + .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 6, + .expected = {.tv_sec = 0x27fffffffLL, 
.tv_nsec = 1L}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 0xFFFFFFFF, + .expected = {.tv_sec = 0x300000000LL, + .tv_nsec = MAX_NANOSECONDS}, + }, + + { + .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = true, + .extra_bits = 3, + .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L}, + }, + + { + .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE, + .msb_set = false, + .lower_bound = false, + .extra_bits = 3, + .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L}, + } +}; + +static void timestamp_expectation_to_desc(const struct timestamp_expectation *t, + char *desc) +{ + strscpy(desc, t->test_case_name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(ext4_inode, test_data, timestamp_expectation_to_desc); + static time64_t get_32bit_time(const struct timestamp_expectation * const test) { if (test->msb_set) { @@ -101,166 +240,35 @@ static time64_t get_32bit_time(const struct timestamp_expectation * const test) */ static void inode_test_xtimestamp_decoding(struct kunit *test) { - const struct timestamp_expectation test_data[] = { - { - .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE, - .msb_set = true, - .lower_bound = true, - .extra_bits = 0, - .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE, - .msb_set = true, - .lower_bound = false, - .extra_bits = 0, - .expected = {.tv_sec = -1LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, - .msb_set = false, - .lower_bound = true, - .extra_bits = 0, - .expected = {0LL, 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE, - .msb_set = false, - .lower_bound = false, - .extra_bits = 0, - .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L}, - }, - - { - .test_case_name = LOWER_BOUND_NEG_LO_1_CASE, - .msb_set = true, - .lower_bound = true, - .extra_bits = 1, - .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NEG_LO_1_CASE, - .msb_set = true, - .lower_bound = false, - .extra_bits = 1, - .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L}, - }, - - { - .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE, - .msb_set = false, - .lower_bound = true, - .extra_bits = 1, - .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE, - .msb_set = false, - .lower_bound = false, - .extra_bits = 1, - .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L}, - }, - - { - .test_case_name = LOWER_BOUND_NEG_HI_1_CASE, - .msb_set = true, - .lower_bound = true, - .extra_bits = 2, - .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NEG_HI_1_CASE, - .msb_set = true, - .lower_bound = false, - .extra_bits = 2, - .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L}, - }, - - { - .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE, - .msb_set = false, - .lower_bound = true, - .extra_bits = 2, - .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE, - .msb_set = false, - .lower_bound = false, - .extra_bits = 2, - .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE, - .msb_set = false, - .lower_bound = false, - .extra_bits = 6, - .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L}, - }, - - { - .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE, - 
.msb_set = false, - .lower_bound = true, - .extra_bits = 0xFFFFFFFF, - .expected = {.tv_sec = 0x300000000LL, - .tv_nsec = MAX_NANOSECONDS}, - }, - - { - .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE, - .msb_set = false, - .lower_bound = true, - .extra_bits = 3, - .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L}, - }, - - { - .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE, - .msb_set = false, - .lower_bound = false, - .extra_bits = 3, - .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L}, - } - }; - struct timespec64 timestamp; - int i; - - for (i = 0; i < ARRAY_SIZE(test_data); ++i) { - timestamp.tv_sec = get_32bit_time(&test_data[i]); - ext4_decode_extra_time(×tamp, - cpu_to_le32(test_data[i].extra_bits)); - - KUNIT_EXPECT_EQ_MSG(test, - test_data[i].expected.tv_sec, - timestamp.tv_sec, - CASE_NAME_FORMAT, - test_data[i].test_case_name, - test_data[i].msb_set, - test_data[i].lower_bound, - test_data[i].extra_bits); - KUNIT_EXPECT_EQ_MSG(test, - test_data[i].expected.tv_nsec, - timestamp.tv_nsec, - CASE_NAME_FORMAT, - test_data[i].test_case_name, - test_data[i].msb_set, - test_data[i].lower_bound, - test_data[i].extra_bits); - } + + struct timestamp_expectation *test_param = + (struct timestamp_expectation *)(test->param_value); + + timestamp.tv_sec = get_32bit_time(test_param); + ext4_decode_extra_time(×tamp, + cpu_to_le32(test_param->extra_bits)); + + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_sec, + timestamp.tv_sec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); + KUNIT_EXPECT_EQ_MSG(test, + test_param->expected.tv_nsec, + timestamp.tv_nsec, + CASE_NAME_FORMAT, + test_param->test_case_name, + test_param->msb_set, + test_param->lower_bound, + test_param->extra_bits); } static struct kunit_case ext4_inode_test_cases[] = { - KUNIT_CASE(inode_test_xtimestamp_decoding), + KUNIT_CASE_PARAM(inode_test_xtimestamp_decoding, ext4_inode_gen_params), {} }; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f0381876a7e5..524e13432447 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -624,7 +624,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) case EXT4_GOING_FLAGS_DEFAULT: freeze_bdev(sb->s_bdev); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 33509266f5a0..326fe402e495 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -643,13 +643,7 @@ static struct stats dx_show_leaf(struct inode *dir, name = de->name; len = de->name_len; - if (IS_ENCRYPTED(dir)) - res = fscrypt_get_encryption_info(dir); - if (res) { - printk(KERN_WARNING "Error setting up" - " fname crypto: %d\n", res); - } - if (!fscrypt_has_encryption_key(dir)) { + if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ ext4fs_dirhash(dir, de->name, de->name_len, &h); @@ -1010,7 +1004,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { - err = fscrypt_get_encryption_info(dir); + err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; @@ -1614,6 +1608,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) @@ -2195,6 
+2190,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; + #ifdef CONFIG_UNICODE if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94472044f4c1..830c196ec069 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4044,9 +4044,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - if (sb->s_bdev->bd_part) - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -4964,11 +4963,6 @@ no_journal: goto failed_mount4; } -#ifdef CONFIG_UNICODE - if (sb->s_encoding) - sb->s_d_op = &ext4_dentry_ops; -#endif - sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); @@ -5505,15 +5499,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & SB_RDONLY)) ext4_update_tstamp(es, s_wtime); - if (sb->s_bdev->bd_part) - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); - else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4e27fe6ed3ae..075aa3a19ff5 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -62,11 +62,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -74,12 +71,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 306413589827..1e5e9b1136ee 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, struct page *dpage) { struct posix_acl *default_acl = NULL, *acl = NULL; - int error = 0; + int error; error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); if (error) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 023462e80e58..897edb7c951a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -37,7 +37,7 @@ void 
f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page = NULL; + struct page *page; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -348,13 +348,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* if locked failed, cp will flush dirty pages instead */ - if (!mutex_trylock(&sbi->cp_mutex)) + if (!down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -1385,6 +1385,26 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, f2fs_submit_merged_write(sbi, META_FLUSH); } +static inline u64 get_sectors_written(struct block_device *bdev) +{ + return (u64)part_stat_read(bdev, sectors[STAT_WRITE]); +} + +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) +{ + if (f2fs_is_multi_device(sbi)) { + u64 sectors = 0; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + sectors += get_sectors_written(FDEV(i).bdev); + + return sectors; + } + + return get_sectors_written(sbi->sb->s_bdev); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1395,7 +1415,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; int err; @@ -1490,9 +1509,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Record write statistics in the hot node summary */ kbytes_written = sbi->kbytes_written; - if (sb->s_bdev->bd_part) - kbytes_written += BD_PART_WRITTEN(sbi); - + kbytes_written += (f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1; seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { @@ -1572,7 +1590,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || @@ -1600,7 +1618,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } - if (NM_I(sbi)->dirty_nat_cnt == 0 && + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); @@ -1647,7 +1665,7 @@ stop: trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); return err; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 14262e0f1cd6..4bcbacfe3325 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -602,6 +602,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; struct page **new_cpages; + u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ 
-655,6 +656,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); + if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); @@ -790,6 +796,22 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); + if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } + out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: @@ -798,8 +820,6 @@ destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); out_free_dic: - if (verity) - atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -921,7 +941,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) static bool cluster_may_compress(struct compress_ctx *cc) { - if (!f2fs_compressed_file(cc->inode)) + if (!f2fs_need_compress_data(cc->inode)) return false; if (f2fs_is_atomic_file(cc->inode)) return false; diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/f2fs/compress.h diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index be4da52604ed..aa34d620bec9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page); if (dic) { - if (atomic_dec_return(&dic->pending_pages)) + if (atomic_dec_return(&dic->verity_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -736,6 +736,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { + if (unlikely(sbi->max_io_bytes && + bio->bi_iter.bi_size >= sbi->max_io_bytes)) + return false; if (last_blkaddr + 1 != cur_blkaddr) return false; return __same_bdev(sbi, cur_blkaddr, bio); @@ -1027,7 +1030,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write) + pgoff_t first_idx, bool for_write, + bool for_verity) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1049,7 +1053,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, post_read_steps |= 1 << STEP_DECRYPT; if (f2fs_compressed_file(inode)) post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (f2fs_need_verity(inode, first_idx)) + if (for_verity && f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { @@ -1079,7 +1083,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write); + 
page->index, for_write, true); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1750,6 +1754,16 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } +static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) +{ + return (bytes >> inode->i_blkbits); +} + +static inline u64 blks_to_bytes(struct inode *inode, u64 blks) +{ + return (blks << inode->i_blkbits); +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type, bool may_write) @@ -1758,7 +1772,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, int err; map.m_lblk = iblock; - map.m_len = bh->b_size >> inode->i_blkbits; + map.m_len = bytes_to_blks(inode, bh->b_size); map.m_next_pgofs = next_pgofs; map.m_next_extent = NULL; map.m_seg_type = seg_type; @@ -1768,20 +1782,11 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = (u64)map.m_len << inode->i_blkbits; + bh->b_size = blks_to_bytes(inode, map.m_len); } return err; } -static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag, - pgoff_t *next_pgofs) -{ - return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs, - NO_CHECK_TYPE, create); -} - static int get_data_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -1800,24 +1805,6 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, false); } -static int get_data_block_bmap(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL, - NO_CHECK_TYPE, create); -} - -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) -{ - return (offset >> inode->i_blkbits); -} - -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) -{ - return (blk << inode->i_blkbits); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1843,7 +1830,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1875,7 +1862,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = (__u64)blk_to_logical(inode, ni.blk_addr); + phys = blks_to_bytes(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1913,7 +1900,7 @@ static loff_t max_inode_blocks(struct inode *inode) int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - struct buffer_head map_bh; + struct f2fs_map_blocks map; sector_t start_blk, last_blk; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; @@ -1945,29 +1932,31 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); + if (bytes_to_blks(inode, len) == 0) + len = blks_to_bytes(inode, 1); - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); + start_blk = bytes_to_blks(inode, start); + last_blk = bytes_to_blks(inode, start + len - 1); next: - memset(&map_bh, 0, sizeof(struct 
buffer_head)); - map_bh.b_size = len; + memset(&map, 0, sizeof(map)); + map.m_lblk = start_blk; + map.m_len = bytes_to_blks(inode, len); + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; if (compr_cluster) - map_bh.b_size = blk_to_logical(inode, cluster_size - 1); + map.m_len = cluster_size - 1; - ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!buffer_mapped(&map_bh)) { + if (!(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, max_inode_blocks(inode))) goto prep_next; @@ -1993,9 +1982,9 @@ next: compr_cluster = false; - logical = blk_to_logical(inode, start_blk - 1); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = blk_to_logical(inode, cluster_size); + logical = blks_to_bytes(inode, start_blk - 1); + phys = blks_to_bytes(inode, map.m_pblk); + size = blks_to_bytes(inode, cluster_size); flags |= FIEMAP_EXTENT_ENCODED; @@ -2007,20 +1996,20 @@ next: goto prep_next; } - if (map_bh.b_blocknr == COMPRESS_ADDR) { + if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; start_blk++; goto prep_next; } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; + logical = blks_to_bytes(inode, start_blk); + phys = blks_to_bytes(inode, map.m_pblk); + size = blks_to_bytes(inode, map.m_len); flags = 0; - if (buffer_unwritten(&map_bh)) + if (map.m_flags & F2FS_MAP_UNWRITTEN) flags = FIEMAP_EXTENT_UNWRITTEN; - start_blk += logical_to_blk(inode, size); + start_blk += bytes_to_blks(inode, size); prep_next: cond_resched(); @@ -2053,8 +2042,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, bool is_readahead) { struct bio *bio = *bio_ret; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -2063,8 +2051,8 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, block_in_file = (sector_t)page_index(page); last_block = block_in_file + nr_pages; - last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >> - blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -2133,7 +2121,7 @@ submit_and_realloc: if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? 
REQ_RAHEAD : 0, page->index, - false); + false, true); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2177,16 +2165,17 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; + const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct bio_post_read_ctx *ctx; + bool for_verity = false; int i; int ret = 0; f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = (f2fs_readpage_limit(inode) + - blocksize - 1) >> blkbits; + last_block_in_file = bytes_to_blks(inode, + f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2245,10 +2234,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } + /* + * It's possible to enable fsverity on the fly when handling a cluster, + * which requires complicated error handling. Instead of adding more + * complexity, let's give a rule where end_io post-processes fsverity + * per cluster. In order to do that, we need to submit bio, if previous + * bio sets a different post-process policy. + */ + if (fsverity_active(cc->inode)) { + atomic_set(&dic->verity_pages, cc->nr_cpages); + for_verity = true; + + if (bio) { + ctx = bio->bi_private; + if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + } + } + for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; - struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2264,17 +2272,31 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? 
REQ_RAHEAD : 0, - page->index, for_write); + page->index, for_write, for_verity); if (IS_ERR(bio)) { + unsigned int remained = dic->nr_cpages - i; + bool release = false; + ret = PTR_ERR(bio); dic->failed = true; - if (!atomic_sub_return(dic->nr_cpages - i, - &dic->pending_pages)) { + + if (for_verity) { + if (!atomic_sub_return(remained, + &dic->verity_pages)) + release = true; + } else { + if (!atomic_sub_return(remained, + &dic->pending_pages)) + release = true; + } + + if (release) { f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); + cc->cluster_size, true, + false); f2fs_free_dic(dic); } + f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -3164,7 +3186,7 @@ static inline bool __should_serialize_io(struct inode *inode, if (IS_NOQUOTA(inode)) return false; - if (f2fs_compressed_file(inode)) + if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; @@ -3799,9 +3821,6 @@ static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct buffer_head tmp = { - .b_size = i_blocksize(inode), - }; sector_t blknr = 0; if (f2fs_has_inline_data(inode)) @@ -3818,8 +3837,16 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) if (f2fs_compressed_file(inode)) { blknr = f2fs_bmap_compress(inode, block); } else { - if (!get_data_block_bmap(inode, block, &tmp, 0)) - blknr = tmp.b_blocknr; + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = block; + map.m_len = 1; + map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; + + if (!f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_BMAP)) + blknr = map.m_pblk; } out: trace_f2fs_bmap(inode, block, blknr); @@ -3895,7 +3922,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - unsigned long len; + u64 len; int ret; /* @@ -3903,29 +3930,31 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, * to be very smart. 
*/ cur_lblock = 0; - last_lblock = logical_to_blk(inode, i_size_read(inode)); + last_lblock = bytes_to_blks(inode, i_size_read(inode)); len = i_size_read(inode); while (cur_lblock <= last_lblock && cur_lblock < sis->max) { - struct buffer_head map_bh; + struct f2fs_map_blocks map; pgoff_t next_pgofs; cond_resched(); - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len - cur_lblock; + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = bytes_to_blks(inode, len) - cur_lblock; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = NO_CHECK_TYPE; - ret = get_data_block(inode, cur_lblock, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP, &next_pgofs); + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto err_out; /* hole */ - if (!buffer_mapped(&map_bh)) + if (!(map.m_flags & F2FS_MAP_FLAGS)) goto err_out; - pblock = map_bh.b_blocknr; - nr_pblocks = logical_to_blk(inode, map_bh.b_size); + pblock = map.m_pblk; + nr_pblocks = map.m_len; if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3968,7 +3997,6 @@ static int check_swap_activate(struct swap_info_struct *sis, struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; - unsigned blkbits; sector_t probe_block; sector_t last_block; sector_t lowest_block = -1; @@ -3979,8 +4007,7 @@ static int check_swap_activate(struct swap_info_struct *sis, if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* * Map all the blocks into the extent list. This code doesn't try @@ -3988,7 +4015,7 @@ static int check_swap_activate(struct swap_info_struct *sis, */ probe_block = 0; page_no = 0; - last_block = i_size_read(inode) >> blkbits; + last_block = bytes_to_blks(inode, i_size_read(inode)); while ((probe_block + blocks_per_page) <= last_block && page_no < sis->max) { unsigned block_in_page; @@ -4028,7 +4055,7 @@ static int check_swap_activate(struct swap_info_struct *sis, } } - first_block >>= (PAGE_SHIFT - blkbits); + first_block >>= (PAGE_SHIFT - inode->i_blkbits); if (page_no) { /* exclude the header page */ if (first_block < lowest_block) lowest_block = first_block; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a8357fd4f5fa..197c914119da 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; @@ -278,9 +278,10 @@ get_cache: si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); - si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); - si->cache_mem += NM_I(sbi)->dirty_nat_cnt * - sizeof(struct nat_entry_set); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); diff --git a/fs/f2fs/dir.c 
b/fs/f2fs/dir.c index 4b9ef8bbfa4a..e6270a867be1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/signal.h> @@ -206,30 +207,55 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, /* * Test whether a case-insensitive directory entry matches the filename * being searched for. + * + * Returns 1 for a match, 0 for no match, and -errno on an error. */ -static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, +static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = dir->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr entry = QSTR_INIT(de_name, de_name_len); int res; + if (IS_ENCRYPTED(dir)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *)de_name, de_name_len); + + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) + return -EINVAL; + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + res = utf8_strncasecmp_folded(um, name, &entry); - if (res < 0) { - /* - * In strict mode, ignore invalid names. In non-strict mode, - * fall back to treating them as opaque byte sequences. - */ - if (sb_has_strict_encoding(sb) || name->len != entry.len) - return false; - return !memcmp(name->name, entry.name, name->len); + /* + * In strict mode, ignore invalid names. In non-strict mode, + * fall back to treating them as opaque byte sequences. 
+ */ + if (res < 0 && !sb_has_strict_encoding(sb)) { + res = name->len == entry.len && + memcmp(name->name, entry.name, name->len) == 0; + } else { + /* utf8_strncasecmp_folded returns 0 on match */ + res = (res == 0); } - return res == 0; +out: + kfree(decrypted_name.name); + return res; } #endif /* CONFIG_UNICODE */ -static inline bool f2fs_match_name(const struct inode *dir, +static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { @@ -256,6 +282,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; + int res = 0; if (max_slots) *max_slots = 0; @@ -273,10 +300,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, continue; } - if (de->hash_code == fname->hash && - f2fs_match_name(d->inode, fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - goto found; + if (de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) + goto found; + } if (max_slots && max_len > *max_slots) *max_slots = max_len; @@ -326,7 +358,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, } de = find_in_block(dir, dentry_page, fname, &max_slots); - if (de) { + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + break; + } else if (de) { *res_page = dentry_page; break; } @@ -448,17 +484,39 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_put_page(page, 1); } -static void init_dent_inode(const struct f2fs_filename *fname, +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, struct page *ipage) { struct f2fs_inode *ri; + if (!fname) /* tmpfile case? */ + return; + f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. + */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } set_page_dirty(ipage); } @@ -541,11 +599,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, return page; } - if (fname) { - init_dent_inode(fname, page); - if (IS_ENCRYPTED(dir)) - file_set_enc_name(inode); - } + init_dent_inode(dir, inode, fname, page); /* * This file should be checkpointed during fsync. @@ -1022,7 +1076,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) int err = 0; if (IS_ENCRYPTED(inode)) { - err = fscrypt_get_encryption_info(inode); + err = fscrypt_prepare_readdir(inode); if (err) goto out; @@ -1081,28 +1135,13 @@ out: return err < 0 ? err : 0; } -static int f2fs_dir_open(struct inode *inode, struct file *filp) -{ - if (IS_ENCRYPTED(inode)) - return fscrypt_get_encryption_info(inode) ? 
-EACCES : 0; - return 0; -} - const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, - .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations f2fs_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cb700d797296..bb11759191dc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -33,10 +33,8 @@ #else #define f2fs_bug_on(sbi, condition) \ do { \ - if (unlikely(condition)) { \ - WARN_ON(1); \ + if (WARN_ON(condition)) \ set_sbi_flag(sbi, SBI_NEED_FSCK); \ - } \ } while (0) #endif @@ -147,8 +145,10 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ - unsigned compress_log_size; /* cluster log size */ + unsigned char compress_log_size; /* cluster log size */ + bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; @@ -402,85 +402,6 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, return size <= MAX_SIT_JENTRIES(journal); } -/* - * f2fs-specific ioctl commands - */ -#define F2FS_IOCTL_MAGIC 0xf5 -#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) -#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) -#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) -#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) -#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) -#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ - struct f2fs_defragment) -#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ - struct f2fs_move_range) -#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ - struct f2fs_flush_device) -#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ - struct f2fs_gc_range) -#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) -#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) -#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) -#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) -#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) -#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) -#define F2FS_IOC_RELEASE_COMPRESS_BLOCKS \ - _IOR(F2FS_IOCTL_MAGIC, 18, __u64) -#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \ - _IOR(F2FS_IOCTL_MAGIC, 19, __u64) -#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, \ - struct f2fs_sectrim_range) - -/* - * should be same as XFS_IOC_GOINGDOWN. 
- * Flags for going down operation used by FS_IOC_GOINGDOWN - */ -#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ -#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ -#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ -#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ -#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ -#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ - -/* - * Flags used by F2FS_IOC_SEC_TRIM_FILE - */ -#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */ -#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */ -#define F2FS_TRIM_FILE_MASK 0x3 - -struct f2fs_gc_range { - u32 sync; - u64 start; - u64 len; -}; - -struct f2fs_defragment { - u64 start; - u64 len; -}; - -struct f2fs_move_range { - u32 dst_fd; /* destination fd */ - u64 pos_in; /* start position in src_fd */ - u64 pos_out; /* start position in dst_fd */ - u64 len; /* size to move */ -}; - -struct f2fs_flush_device { - u32 dev_num; /* device number to flush */ - u32 segments; /* # of segments to flush */ -}; - -struct f2fs_sectrim_range { - u64 start; - u64 len; - u64 flags; -}; - /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 static inline int get_extra_isize(struct inode *inode); @@ -533,9 +454,11 @@ struct f2fs_filename { #ifdef CONFIG_UNICODE /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode or if the filesystem is - * doing an internal operation where usr_fname is also NULL. In these - * cases we fall back to treating the name as an opaque byte sequence. + * if the original name is not valid Unicode, if the directory is both + * casefolded and encrypted and its encryption key is unavailable, or if + * the filesystem is doing an internal operation where usr_fname is also + * NULL. In all these cases we fall back to treating the name as an + * opaque byte sequence. 
*/ struct fscrypt_str cf_name; #endif @@ -753,7 +676,9 @@ enum { FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ + FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ FI_MAX, /* max flag, never be used */ }; @@ -810,6 +735,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -894,6 +820,13 @@ enum nid_state { MAX_NID_STATE, }; +enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -909,8 +842,7 @@ struct f2fs_nm_info { struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ - unsigned int nat_cnt; /* the # of cached nat entries */ - unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ @@ -1320,6 +1252,18 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +enum { + COMPR_MODE_FS, /* + * automatically compress compression + * enabled files + */ + COMPR_MODE_USER, /* + * automatic compression is disabled. + * user can control the file compression + * using ioctls + */ +}; + /* * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. 
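As a rough illustration of the new COMPR_MODE_USER mode introduced in the hunk above (a hypothetical userspace sketch, not part of this series): with the filesystem mounted using compress_mode=user, automatic compression is off, and userspace drives compression per file through the F2FS_IOC_COMPRESS_FILE / F2FS_IOC_DECOMPRESS_FILE ioctls added later in this series. The sketch assumes headers from a kernel carrying this series, so that <linux/f2fs.h> (the new uapi header) provides those ioctl definitions, and assumes the target file already carries the compression flag.

/*
 * Hypothetical sketch only: exercise "user" compression mode on a file
 * that already has the compression flag set, on an f2fs filesystem
 * mounted with -o compress_mode=user.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/f2fs.h>	/* assumed to define F2FS_IOC_(DE)COMPRESS_FILE */

int main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file-on-f2fs>\n", argv[0]);
		return 1;
	}
	/* The handlers require FMODE_WRITE and return -EBADF otherwise. */
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Rewrite the file's clusters in compressed form... */
	if (ioctl(fd, F2FS_IOC_COMPRESS_FILE) < 0)
		perror("F2FS_IOC_COMPRESS_FILE");
	/* ...and back to uncompressed form. */
	if (ioctl(fd, F2FS_IOC_DECOMPRESS_FILE) < 0)
		perror("F2FS_IOC_DECOMPRESS_FILE");
	close(fd);
	return 0;
}

As the f2fs_ioc_compress_file()/f2fs_ioc_decompress_file() handlers added below enforce, both ioctls fail with -EOPNOTSUPP unless the superblock has the compression feature and compress_mode=user is in effect, with -EINVAL if the file is not a compression-enabled file, and with -EBUSY while the file is mmapped; on partial failure the file may be left partially (de)compressed, as the f2fs_warn() messages in those handlers note.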
@@ -1349,9 +1293,15 @@ enum compress_algorithm_type { COMPRESS_MAX, }; -#define COMPRESS_DATA_RESERVED_SIZE 5 +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data chksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1404,6 +1354,7 @@ struct decompress_io_ctx { size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ atomic_t pending_pages; /* in-flight compressed page count */ + atomic_t verity_pages; /* in-flight page count for verity */ bool failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ @@ -1446,7 +1397,7 @@ struct f2fs_sb_info { int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* checkpoint procedure lock */ + struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ struct rw_semaphore node_change; /* locking node change */ @@ -1496,6 +1447,7 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ + u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -1671,13 +1623,6 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) return sbi->s_ndevs > 1; } -/* For write statistics. Suppose sector size is 512 bytes, - * and the return value is in kbytes. s is of struct f2fs_sb_info. 
- */ -#define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ - (s)->sectors_written_start) >> 1) - static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { unsigned long now = jiffies; @@ -2477,24 +2422,31 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { - if (sbi->gc_mode == GC_URGENT_HIGH) - return true; - if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || get_pages(sbi, F2FS_DIO_WRITE)) - return false; + return true; if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info && atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) - return false; + return true; if (SM_I(sbi) && SM_I(sbi)->fcc_info && atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return true; + return false; +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (sbi->gc_mode == GC_URGENT_HIGH) + return true; + + if (is_inflight_io(sbi, type)) return false; if (sbi->gc_mode == GC_URGENT_LOW && @@ -2829,6 +2781,22 @@ static inline int f2fs_compressed_file(struct inode *inode) is_inode_flag_set(inode, FI_COMPRESSED_FILE); } +static inline bool f2fs_need_compress_data(struct inode *inode) +{ + int compress_mode = F2FS_OPTION(F2FS_I_SB(inode)).compress_mode; + + if (!f2fs_compressed_file(inode)) + return false; + + if (compress_mode == COMPR_MODE_FS) + return true; + else if (compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; + + return false; +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - @@ -3251,6 +3219,8 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { + if (fscrypt_is_nokey_name(dentry)) + return -ENOKEY; return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } @@ -3443,6 +3413,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); +u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); @@ -3767,9 +3738,6 @@ static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations f2fs_dentry_ops; -#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; @@ -3961,6 +3929,9 @@ static inline void set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_algorithm; F2FS_I(inode)->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? 
+ 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ee861c6d9ff0..f585545277d7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -31,6 +31,7 @@ #include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> +#include <uapi/linux/f2fs.h> static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { @@ -412,9 +413,14 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode) && whence == SEEK_HOLE) { - data_ofs = isize; - goto found; + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) { + data_ofs = isize; + goto found; + } else if (whence == SEEK_DATA) { + data_ofs = offset; + goto found; + } } pgofs = (pgoff_t)(offset >> PAGE_SHIFT); @@ -2230,16 +2236,12 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); + ret = freeze_bdev(sb->s_bdev); + if (ret) goto out; - } - if (sb) { - f2fs_stop_checkpoint(sbi, false); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - thaw_bdev(sb->s_bdev, sb); - } + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ @@ -2474,26 +2476,19 @@ out: return ret; } -static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { - struct inode *inode = file_inode(filp); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_gc_range range; + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); u64 end; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - - if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, - sizeof(range))) - return -EFAULT; - if (f2fs_readonly(sbi->sb)) return -EROFS; - end = range.start + range.len; - if (end < range.start || range.start < MAIN_BLKADDR(sbi) || + end = range->start + range->len; + if (end < range->start || range->start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) return -EINVAL; @@ -2502,7 +2497,7 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; do_more: - if (!range.sync) { + if (!range->sync) { if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; @@ -2511,20 +2506,30 @@ do_more: down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; goto out; } - range.start += BLKS_PER_SEC(sbi); - if (range.start <= end) + range->start += BLKS_PER_SEC(sbi); + if (range->start <= end) goto do_more; out: mnt_drop_write_file(filp); return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct f2fs_gc_range range; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_gc_range(filp, &range); +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2861,9 +2866,9 @@ out: return ret; } -static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +static int __f2fs_ioc_move_range(struct file *filp, + struct f2fs_move_range *range) { - struct f2fs_move_range range; struct fd dst; int err; @@ 
-2871,11 +2876,7 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) !(filp->f_mode & FMODE_WRITE)) return -EBADF; - if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, - sizeof(range))) - return -EFAULT; - - dst = fdget(range.dst_fd); + dst = fdget(range->dst_fd); if (!dst.file) return -EBADF; @@ -2888,21 +2889,25 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) if (err) goto err_out; - err = f2fs_move_file_range(filp, range.pos_in, dst.file, - range.pos_out, range.len); + err = f2fs_move_file_range(filp, range->pos_in, dst.file, + range->pos_out, range->len); mnt_drop_write_file(filp); - if (err) - goto err_out; - - if (copy_to_user((struct f2fs_move_range __user *)arg, - &range, sizeof(range))) - err = -EFAULT; err_out: fdput(dst); return err; } +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + return __f2fs_ioc_move_range(filp, &range); +} + static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3939,13 +3944,265 @@ err: return ret; } -long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int f2fs_ioc_get_compress_option(struct file *filp, unsigned long arg) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) - return -EIO; - if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) - return -ENOSPC; + struct inode *inode = file_inode(filp); + struct f2fs_comp_option option; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + inode_lock_shared(inode); + + if (!f2fs_compressed_file(inode)) { + inode_unlock_shared(inode); + return -ENODATA; + } + + option.algorithm = F2FS_I(inode)->i_compress_algorithm; + option.log_cluster_size = F2FS_I(inode)->i_log_cluster_size; + + inode_unlock_shared(inode); + + if (copy_to_user((struct f2fs_comp_option __user *)arg, &option, + sizeof(option))) + return -EFAULT; + + return 0; +} + +static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_comp_option option; + int ret = 0; + + if (!f2fs_sb_has_compression(sbi)) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&option, (struct f2fs_comp_option __user *)arg, + sizeof(option))) + return -EFAULT; + + if (!f2fs_compressed_file(inode) || + option.log_cluster_size < MIN_COMPRESS_LOG_SIZE || + option.log_cluster_size > MAX_COMPRESS_LOG_SIZE || + option.algorithm >= COMPRESS_MAX) + return -EINVAL; + + file_start_write(filp); + inode_lock(inode); + + if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { + ret = -EBUSY; + goto out; + } + + if (inode->i_size != 0) { + ret = -EFBIG; + goto out; + } + + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) + f2fs_warn(sbi, "compression algorithm is successfully set, " + "but current kernel doesn't support this algorithm."); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) +{ + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, 
page_idx); + struct address_space *mapping = inode->i_mapping; + struct page *page; + pgoff_t redirty_idx = page_idx; + int i, page_len = 0, ret = 0; + + page_cache_ra_unbounded(&ractl, len, 0); + + for (i = 0; i < len; i++, page_idx++) { + page = read_cache_page(mapping, page_idx, NULL, NULL); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + break; + } + page_len++; + } + + for (i = 0; i < page_len; i++, redirty_idx++) { + page = find_lock_page(mapping, redirty_idx); + if (!page) + ret = -ENOENT; + set_page_dirty(page); + f2fs_put_page(page, 1); + f2fs_put_page(page, 0); + } + + return ret; +} + +static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (f2fs_is_mmap_file(inode)) { + ret = -EBUSY; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + if (!atomic_read(&fi->i_compr_blocks)) + goto out; + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially decompressed " + "(errno=%d). 
Please delete the file.\n", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t page_idx = 0, last_idx; + unsigned int blk_per_seg = sbi->blocks_per_seg; + int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, ret; + + if (!f2fs_sb_has_compression(sbi) || + F2FS_OPTION(sbi).compress_mode != COMPR_MODE_USER) + return -EOPNOTSUPP; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + f2fs_balance_fs(F2FS_I_SB(inode), true); + + file_start_write(filp); + inode_lock(inode); + + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + + if (f2fs_is_mmap_file(inode)) { + ret = -EBUSY; + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + goto out; + + set_inode_flag(inode, FI_ENABLE_COMPRESS); + + last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + + count = last_idx - page_idx; + while (count) { + int len = min(cluster_size, count); + + ret = redirty_blocks(inode, page_idx, len); + if (ret < 0) + break; + + if (get_dirty_pages(inode) >= blk_per_seg) + filemap_fdatawrite(inode->i_mapping); + + count -= len; + page_idx += len; + } + + if (!ret) + ret = filemap_write_and_wait_range(inode->i_mapping, 0, + LLONG_MAX); + + clear_inode_flag(inode, FI_ENABLE_COMPRESS); + + if (ret) + f2fs_warn(sbi, "%s: The file might be partially compressed " + "(errno=%d). Please delete the file.\n", + __func__, ret); +out: + inode_unlock(inode); + file_end_write(filp); + + return ret; +} + +static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ switch (cmd) { case FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -4027,11 +4284,29 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_reserve_compress_blocks(filp, arg); case F2FS_IOC_SEC_TRIM_FILE: return f2fs_sec_trim_file(filp, arg); + case F2FS_IOC_GET_COMPRESS_OPTION: + return f2fs_ioc_get_compress_option(filp, arg); + case F2FS_IOC_SET_COMPRESS_OPTION: + return f2fs_ioc_set_compress_option(filp, arg); + case F2FS_IOC_DECOMPRESS_FILE: + return f2fs_ioc_decompress_file(filp, arg); + case F2FS_IOC_COMPRESS_FILE: + return f2fs_ioc_compress_file(filp, arg); default: return -ENOTTY; } } +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(filp)))) + return -ENOSPC; + + return __f2fs_ioctl(filp, cmd, arg); +} + static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -4148,8 +4423,63 @@ out: } #ifdef CONFIG_COMPAT +struct compat_f2fs_gc_range { + u32 sync; + compat_u64 start; + compat_u64 len; +}; +#define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\ + struct compat_f2fs_gc_range) + +static int f2fs_compat_ioc_gc_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_gc_range __user *urange; + struct f2fs_gc_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.sync, &urange->sync); + err |= get_user(range.start, &urange->start); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_gc_range(file, &range); +} + +struct compat_f2fs_move_range { 
+ u32 dst_fd; + compat_u64 pos_in; + compat_u64 pos_out; + compat_u64 len; +}; +#define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct compat_f2fs_move_range) + +static int f2fs_compat_ioc_move_range(struct file *file, unsigned long arg) +{ + struct compat_f2fs_move_range __user *urange; + struct f2fs_move_range range; + int err; + + urange = compat_ptr(arg); + err = get_user(range.dst_fd, &urange->dst_fd); + err |= get_user(range.pos_in, &urange->pos_in); + err |= get_user(range.pos_out, &urange->pos_out); + err |= get_user(range.len, &urange->len); + if (err) + return -EFAULT; + + return __f2fs_ioc_move_range(file, &range); +} + long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; + if (!f2fs_is_checkpoint_ready(F2FS_I_SB(file_inode(file)))) + return -ENOSPC; + switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -4160,6 +4490,10 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case F2FS_IOC32_GARBAGE_COLLECT_RANGE: + return f2fs_compat_ioc_gc_range(file, arg); + case F2FS_IOC32_MOVE_RANGE: + return f2fs_compat_ioc_move_range(file, arg); case F2FS_IOC_START_ATOMIC_WRITE: case F2FS_IOC_COMMIT_ATOMIC_WRITE: case F2FS_IOC_START_VOLATILE_WRITE: @@ -4177,10 +4511,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case FS_IOC_GET_ENCRYPTION_NONCE: case F2FS_IOC_GARBAGE_COLLECT: - case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: case FS_IOC_FSGETXATTR: @@ -4197,11 +4529,15 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: case F2FS_IOC_SEC_TRIM_FILE: + case F2FS_IOC_GET_COMPRESS_OPTION: + case F2FS_IOC_SET_COMPRESS_OPTION: + case F2FS_IOC_DECOMPRESS_FILE: + case F2FS_IOC_COMPRESS_FILE: break; default: return -ENOIOCTLCMD; } - return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); + return __f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 05641a1e36cc..3ef84e6ded41 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1986,7 +1986,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) freeze_super(sbi->sb); down_write(&sbi->gc_lock); - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); spin_lock(&sbi->stat_lock); if (shrunk_blocks + valid_user_blocks(sbi) + @@ -2031,7 +2031,7 @@ recover_out: spin_unlock(&sbi->stat_lock); } out_err: - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); up_write(&sbi->gc_lock); thaw_super(sbi->sb); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index de841aaf3c43..e3beac546c63 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -111,7 +111,9 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. + * back to treating the name as an opaque byte sequence. 
Note + * that to handle encrypted directories, the fallback must use + * usr_fname (plaintext) rather than disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { @@ -121,6 +123,13 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) name = fname->usr_fname->name; len = fname->usr_fname->len; } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); + + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } } #endif fname->hash = cpu_to_le32(TEA_hash_name(name, len)); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 70384e31788d..806ebabf5870 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -188,7 +188,8 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - if (!f2fs_has_inline_data(inode)) + if (!f2fs_has_inline_data(inode) || + f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; page = f2fs_grab_cache_page(inode->i_mapping, 0, false); @@ -266,7 +267,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) * [prev.] [next] of inline_data flag * o o -> recover inline_data * o x -> remove inline_data, and then recover data blocks - * x o -> remove inline_data, and then recover inline_data + * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ if (IS_INODE(npage)) @@ -298,6 +299,7 @@ process_inline: if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); + stat_dec_inline_inode(inode); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { @@ -306,6 +308,7 @@ process_inline: ret = f2fs_truncate_blocks(inode, 0, false); if (ret) return ret; + stat_inc_inline_inode(inode); goto process_inline; } return 0; @@ -332,6 +335,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, make_dentry_ptr_inline(dir, &d, inline_dentry); de = f2fs_find_target_dentry(&d, fname, NULL); unlock_page(ipage); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + } if (de) *res_page = ipage; else diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 657db2fb6739..349d9cb933ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -456,6 +456,7 @@ static int do_read_inode(struct inode *inode) le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -634,6 +635,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; + ri->i_compress_flag = + cpu_to_le16(F2FS_I(inode)->i_compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8fa37d1434de..6edb1ab579a1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -497,6 +497,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d5d8ce077f29..3a24423ac65f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct free_nid)) >> 
PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_SHIFT; + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; @@ -109,7 +109,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock); - nm_i->nat_cnt++; + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; return ne; } @@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; __free_nat_entry(e); } @@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; - nm_i->dirty_nat_cnt++; + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; - nm_i->dirty_nat_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2590,9 +2594,15 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (ri->i_inline & F2FS_INLINE_XATTR) { - set_inode_flag(inode, FI_INLINE_XATTR); + if (!f2fs_has_inline_xattr(inode)) { + set_inode_flag(inode, FI_INLINE_XATTR); + stat_inc_inline_xattr(inode); + } } else { - clear_inode_flag(inode, FI_INLINE_XATTR); + if (f2fs_has_inline_xattr(inode)) { + stat_dec_inline_xattr(inode); + clear_inode_flag(inode, FI_INLINE_XATTR); + } goto update_inode; } @@ -2944,14 +2954,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) LIST_HEAD(sets); int err = 0; - /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ if (enabled_nat_bits(sbi, cpc)) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); } - if (!nm_i->dirty_nat_cnt) + if (!nm_i->nat_cnt[DIRTY_NAT]) return 0; down_write(&nm_i->nat_tree_lock); @@ -2962,7 +2975,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. 
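The nat_cnt[] conversion above replaces the old nat_cnt/dirty_nat_cnt pair with three buckets that are updated together, keeping the invariant TOTAL_NAT == DIRTY_NAT + RECLAIMABLE_NAT; that is what lets the shrinker later read nat_cnt[RECLAIMABLE_NAT] directly instead of racing on a subtraction. A minimal model of the transitions (the function names in comments refer to the hunks above; the standalone code is illustrative only):

enum nat_state { TOTAL_NAT, DIRTY_NAT, RECLAIMABLE_NAT, MAX_NAT_STATE };

static unsigned int nat_cnt[MAX_NAT_STATE];

static void nat_add(void)            /* __init_nat_entry(): new entries are clean */
{
        nat_cnt[TOTAL_NAT]++;
        nat_cnt[RECLAIMABLE_NAT]++;
}

static void nat_set_dirty(void)      /* __set_nat_cache_dirty() */
{
        nat_cnt[DIRTY_NAT]++;
        nat_cnt[RECLAIMABLE_NAT]--;  /* dirty entries must be flushed first */
}

static void nat_clear_dirty(void)    /* __clear_nat_cache_dirty() */
{
        nat_cnt[DIRTY_NAT]--;
        nat_cnt[RECLAIMABLE_NAT]++;
}

static void nat_del(void)            /* __del_from_nat_cache(): clean entries only */
{
        nat_cnt[TOTAL_NAT]--;
        nat_cnt[RECLAIMABLE_NAT]--;
}
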
*/ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -3086,7 +3100,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; - nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; @@ -3220,7 +3233,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, natvec[idx]); } } - f2fs_bug_on(sbi, nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]); /* destroy nat set cache */ nid = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 69e5859e993c..f84541b57acb 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4f12ade6410a..da75d5d52f0a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include "f2fs.h" @@ -128,7 +129,16 @@ static int init_recovered_filename(const struct inode *dir, } /* Compute the hash of the filename */ - if (IS_CASEFOLDED(dir)) { + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. 
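For directories that are both encrypted and casefolded, the dirhash cannot be recomputed during roll-forward recovery without the key, so the hash was saved on disk right after the name bytes inside i_name[], and the hunk just below reads it back with an unaligned load after bound-checking. A sketch of that read, under the layout the bound check implies:

/* i_name layout for encrypted+casefolded entries (implied by the
 * check below): [ name bytes (len) ][ le32 siphash dirhash ] */
static __le32 read_saved_dirhash(const u8 *i_name, size_t name_len)
{
        /* the stored hash is not naturally aligned, hence get_unaligned() */
        return get_unaligned((__le32 *)&i_name[name_len]);
}
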
+ */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { err = f2fs_init_casefolded_name(dir, fname); if (err) return err; @@ -789,7 +799,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ - mutex_lock(&sbi->cp_mutex); + down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); @@ -840,7 +850,7 @@ skip: if (!err) clear_sbi_flag(sbi, SBI_POR_DOING); - mutex_unlock(&sbi->cp_mutex); + up_write(&sbi->cp_global_sem); /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list, err); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1596502f7375..deca74cb17df 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -529,31 +529,38 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi, REQ_TIME) && - (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) + if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || + excess_prefree_segs(sbi)) + goto do_sync; + + /* there is background inflight IO or foreground operation recently */ + if (is_inflight_io(sbi, REQ_TIME) || + (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem))) return; + /* exceed periodical checkpoint timeout threshold */ + if (f2fs_time_over(sbi, CP_TIME)) + goto do_sync; + /* checkpoint is the only way to shrink partial cached entries */ - if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || - !f2fs_available_free_memory(sbi, INO_ENTRIES) || - excess_prefree_segs(sbi) || - excess_dirty_nats(sbi) || - excess_dirty_nodes(sbi) || - f2fs_time_over(sbi, CP_TIME)) { - if (test_opt(sbi, DATA_FLUSH) && from_bg) { - struct blk_plug plug; - - mutex_lock(&sbi->flush_lock); - - blk_start_plug(&plug); - f2fs_sync_dirty_inodes(sbi, FILE_INODE); - blk_finish_plug(&plug); + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) || + f2fs_available_free_memory(sbi, INO_ENTRIES)) + return; - mutex_unlock(&sbi->flush_lock); - } - f2fs_sync_fs(sbi->sb, true); - stat_inc_bg_cp_count(sbi->stat_info); +do_sync: + if (test_opt(sbi, DATA_FLUSH) && from_bg) { + struct blk_plug plug; + + mutex_lock(&sbi->flush_lock); + + blk_start_plug(&plug); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + + mutex_unlock(&sbi->flush_lock); } + f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, @@ -3254,7 +3261,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) else return CURSEG_COLD_DATA; } - if (file_is_cold(inode) || f2fs_compressed_file(inode)) + if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -4544,7 +4551,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) return; mutex_lock(&dirty_i->seglist_lock); - for (segno = 0; segno < MAIN_SECS(sbi); segno += blks_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index d66de5999a26..dd3c3c7a90ec 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,9 +18,7 @@ static unsigned int shrinker_run_no; 
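The f2fs_balance_fs_bg() rework above turns one large compound condition into an ordered policy. Restated as a predicate (the helper names are the kernel's; the wrapper function itself is illustrative only):

static bool want_bg_checkpoint(struct f2fs_sb_info *sbi)
{
        /* hard pressure: too many dirty NATs/nodes or prefree segments */
        if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
            excess_prefree_segs(sbi))
                return true;
        /* busy: background IO in flight, or a recent foreground op */
        if (is_inflight_io(sbi, REQ_TIME) ||
            (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
                return false;
        /* periodic checkpoint interval expired */
        if (f2fs_time_over(sbi, CP_TIME))
                return true;
        /* last resort: partial caches can only be shrunk by a checkpoint */
        return !f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
               !f2fs_available_free_memory(sbi, INO_ENTRIES);
}
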
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; - - return count > 0 ? count : 0; + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f51807..b4a07fe62d1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,8 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_compress_chksum, + Opt_compress_mode, Opt_atgc, Opt_err, }; @@ -214,6 +216,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, + {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -934,10 +938,29 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; + case Opt_compress_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "fs")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } else if (!strcmp(name, "user")) { + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_compress_chksum: + case Opt_compress_mode: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1523,6 +1546,14 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); + + if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_FS) + seq_printf(seq, ",compress_mode=%s", "fs"); + else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) + seq_printf(seq, ",compress_mode=%s", "user"); } static int f2fs_show_options(struct seq_file *seq, struct dentry *root) @@ -1672,6 +1703,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; sbi->sb->s_flags &= ~SB_INLINECRYPT; @@ -1904,7 +1936,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (*flags & SB_RDONLY || F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { - writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2744,7 +2775,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); - unsigned int blocksize; size_t crc_offset = 0; __u32 crc = 0; @@ -2770,18 +2800,11 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, } } - /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_SIZE) { - f2fs_info(sbi, "Invalid page_cache_size (%lu), supports only 4KB", - PAGE_SIZE); - return -EFSCORRUPTED; - } - /* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != 
F2FS_BLKSIZE) { - f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB", - blocksize); + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; } @@ -3071,9 +3094,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = (le32_to_cpu(raw_super->segment_count_nat) / 2) * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK; - sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); - sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); - sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); + F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); + F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; sbi->next_victim_seg[BG_GC] = NULL_SEGNO; sbi->next_victim_seg[FG_GC] = NULL_SEGNO; @@ -3151,7 +3174,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; int ret; @@ -3399,12 +3422,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) struct unicode_map *encoding; __u16 encoding_flags; - if (f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, - "Can't mount with encoding and encryption"); - return -EINVAL; - } - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, &encoding_flags)) { f2fs_err(sbi, @@ -3427,7 +3444,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; - sbi->sb->s_d_op = &f2fs_dentry_ops; } #else if (f2fs_sb_has_casefold(sbi)) { @@ -3559,7 +3575,7 @@ try_onemore: sbi->valid_super_block = valid_super_block; init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); - mutex_init(&sbi->cp_mutex); + init_rwsem(&sbi->cp_global_sem); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); @@ -3700,10 +3716,7 @@ try_onemore: } /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = f2fs_get_sectors_written(sbi); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); @@ -3918,6 +3931,7 @@ free_bio_info: #ifdef CONFIG_UNICODE utf8_unload(sb->s_encoding); + sb->s_encoding = NULL; #endif free_options: #ifdef CONFIG_QUOTA diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ec77ccfea923..30bae57428d1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -90,25 +90,17 @@ static ssize_t free_segments_show(struct f2fs_attr *a, static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); + ((f2fs_get_sectors_written(sbi) - + sbi->sectors_written_start) >> 1))); } static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); @@ -566,6 +558,7 @@ 
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -650,6 +643,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), ATTR_LIST(readdir_ra), + ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/fcntl.c b/fs/fcntl.c index 19ac5baad50f..05b36b28f2e8 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -781,9 +781,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; enum pid_type type; + unsigned long flags; struct pid *pid; - read_lock(&fown->lock); + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; @@ -804,7 +805,7 @@ void send_sigio(struct fown_struct *fown, int fd, int band) read_unlock(&tasklist_lock); } out_unlock_fown: - read_unlock(&fown->lock); + read_unlock_irqrestore(&fown->lock, flags); } static void send_sigurg_to_task(struct task_struct *p, @@ -819,9 +820,10 @@ int send_sigurg(struct fown_struct *fown) struct task_struct *p; enum pid_type type; struct pid *pid; + unsigned long flags; int ret = 0; - read_lock(&fown->lock); + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; pid = fown->pid; @@ -844,7 +846,7 @@ int send_sigurg(struct fown_struct *fown) read_unlock(&tasklist_lock); } out_unlock_fown: - read_unlock(&fown->lock); + read_unlock_irqrestore(&fown->lock, flags); return ret; } diff --git a/fs/file.c b/fs/file.c index 4559b5fec3bd..8434e0afecc7 100644 --- a/fs/file.c +++ b/fs/file.c @@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); - /* make sure all __fd_install() have seen resize_in_progress + /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. 
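The irqsave conversion in send_sigio()/send_sigurg() above matters because fown->lock can also be taken from interrupt context (e.g. via kill_fasync()); with queued rwlocks, a reader interrupted while another CPU spins in write_lock_irq() can deadlock against itself. The general pattern for a read side that may run with or without interrupts enabled (sketch):

static void read_shared_state(rwlock_t *lock)
{
        unsigned long flags;

        /* irqsave works in any context: it disables local interrupts
         * and remembers whether they were already off */
        read_lock_irqsave(lock, flags);
        /* ... read the fields guarded by the lock ... */
        read_unlock_irqrestore(lock, flags);
}
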
*/ if (atomic_read(&files->count) > 1) @@ -181,7 +181,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - /* coupled with smp_rmb() in __fd_install() */ + /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 1; } @@ -411,19 +411,6 @@ static struct fdtable *close_files(struct files_struct * files) return fdt; } -struct files_struct *get_files_struct(struct task_struct *task) -{ - struct files_struct *files; - - task_lock(task); - files = task->files; - if (files) - atomic_inc(&files->count); - task_unlock(task); - - return files; -} - void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { @@ -436,18 +423,6 @@ void put_files_struct(struct files_struct *files) } } -void reset_files_struct(struct files_struct *files) -{ - struct task_struct *tsk = current; - struct files_struct *old; - - old = tsk->files; - task_lock(tsk); - tsk->files = files; - task_unlock(tsk); - put_files_struct(old); -} - void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -492,9 +467,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) /* * allocate a file descriptor, mark it busy. */ -int __alloc_fd(struct files_struct *files, - unsigned start, unsigned end, unsigned flags) +static int alloc_fd(unsigned start, unsigned end, unsigned flags) { + struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; @@ -550,14 +525,9 @@ out: return error; } -static int alloc_fd(unsigned start, unsigned flags) -{ - return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); -} - int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { - return __alloc_fd(current->files, 0, nofile, flags); + return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) @@ -596,17 +566,13 @@ EXPORT_SYMBOL(put_unused_fd); * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * - * NOTE: __fd_install() variant is really, really low-level; don't - * use it unless you are forced to by truly lousy API shoved down - * your throat. 'files' *MUST* be either current->files or obtained - * by get_files_struct(current) done by whoever had given it to you, - * or really bad things will happen. Normally you want to use - * fd_install() instead. + * This consumes the "file" refcount, so callers should treat it + * as if they had called fput(file). */ -void __fd_install(struct files_struct *files, unsigned int fd, - struct file *file) +void fd_install(unsigned int fd, struct file *file) { + struct files_struct *files = current->files; struct fdtable *fdt; rcu_read_lock_sched(); @@ -628,15 +594,6 @@ void __fd_install(struct files_struct *files, unsigned int fd, rcu_read_unlock_sched(); } -/* - * This consumes the "file" refcount, so callers should treat it - * as if they had called fput(file). - */ -void fd_install(unsigned int fd, struct file *file) -{ - __fd_install(current->files, fd, file); -} - EXPORT_SYMBOL(fd_install); static struct file *pick_file(struct files_struct *files, unsigned fd) @@ -659,11 +616,9 @@ out_unlock: return file; } -/* - * The same warnings as for __alloc_fd()/__fd_install() apply here... 
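With the low-level __fd_install()/__alloc_fd() variants gone, operating on anything but current->files is no longer supported, and the comment above makes the ownership rule explicit: fd_install() consumes the file reference. The canonical caller pattern this leaves (sketch):

static int publish_file(struct file *file)
{
        int fd = get_unused_fd_flags(O_CLOEXEC);

        if (fd < 0) {
                fput(file);           /* we still own the reference */
                return fd;
        }
        fd_install(fd, file);         /* reference now owned by the fd table */
        return fd;
}
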
- */ -int __close_fd(struct files_struct *files, unsigned fd) +int close_fd(unsigned fd) { + struct files_struct *files = current->files; struct file *file; file = pick_file(files, fd); @@ -672,7 +627,36 @@ int __close_fd(struct files_struct *files, unsigned fd) return filp_close(file, files); } -EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /* for ksys_close() */ + +static inline void __range_cloexec(struct files_struct *cur_fds, + unsigned int fd, unsigned int max_fd) +{ + struct fdtable *fdt; + + if (fd > max_fd) + return; + + spin_lock(&cur_fds->file_lock); + fdt = files_fdtable(cur_fds); + bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); + spin_unlock(&cur_fds->file_lock); +} + +static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, + unsigned int max_fd) +{ + while (fd <= max_fd) { + struct file *file; + + file = pick_file(cur_fds, fd++); + if (!file) + continue; + + filp_close(file, cur_fds); + cond_resched(); + } +} /** * __close_range() - Close all file descriptors in a given range. @@ -689,7 +673,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; - if (flags & ~CLOSE_RANGE_UNSHARE) + if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) @@ -727,16 +711,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) } max_fd = min(max_fd, cur_max); - while (fd <= max_fd) { - struct file *file; - file = pick_file(cur_fds, fd++); - if (!file) - continue; - - filp_close(file, cur_fds); - cond_resched(); - } + if (flags & CLOSE_RANGE_CLOEXEC) + __range_cloexec(cur_fds, fd, max_fd); + else + __range_close(cur_fds, fd, max_fd); if (fds) { /* @@ -753,11 +732,11 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) } /* - * variant of __close_fd that gets a ref on the file for later fput. + * variant of close_fd that gets a ref on the file for later fput. * The caller must ensure that filp_close() called on the file, and then * an fput(). */ -int __close_fd_get_file(unsigned int fd, struct file **res) +int close_fd_get_file(unsigned int fd, struct file **res) { struct files_struct *files = current->files; struct file *file; @@ -826,7 +805,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd, rcu_read_lock(); loop: - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { /* File object ref couldn't be taken. * dup2() atomicity guarantee is the reason @@ -877,6 +856,42 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } +struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +{ + /* Must be called with rcu_read_lock held */ + struct files_struct *files; + struct file *file = NULL; + + task_lock(task); + files = task->files; + if (files) + file = files_lookup_fd_rcu(files, fd); + task_unlock(task); + + return file; +} + +struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +{ + /* Must be called with rcu_read_lock held */ + struct files_struct *files; + unsigned int fd = *ret_fd; + struct file *file = NULL; + + task_lock(task); + files = task->files; + if (files) { + for (; fd < files_fdtable(files)->max_fds; fd++) { + file = files_lookup_fd_rcu(files, fd); + if (file) + break; + } + } + task_unlock(task); + *ret_fd = fd; + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
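CLOSE_RANGE_CLOEXEC, handled above by setting bits in the close_on_exec bitmap under file_lock instead of closing anything, gives user space a cheap way to mark a whole descriptor range close-on-exec in one call. A minimal user-space sketch (flag value and syscall number as in the uapi headers of this era, 1U<<2 and 436; verify locally):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef CLOSE_RANGE_CLOEXEC
#define CLOSE_RANGE_CLOEXEC (1U << 2)   /* from linux/close_range.h */
#endif
#ifndef __NR_close_range
#define __NR_close_range 436            /* assumed; arch-dependent */
#endif

int main(void)
{
        /* Mark every fd >= 3 close-on-exec instead of closing it. */
        if (syscall(__NR_close_range, 3, ~0U, CLOSE_RANGE_CLOEXEC) < 0) {
                perror("close_range");
                return 1;
        }
        return 0;
}
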
* @@ -899,7 +914,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) struct file *file; if (atomic_read(&files->count) == 1) { - file = __fcheck_files(files, fd); + file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; @@ -1021,7 +1036,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) struct files_struct *files = current->files; if (!file) - return __close_fd(files, fd); + return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; @@ -1110,7 +1125,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) spin_lock(&files->file_lock); err = expand_files(files, newfd); - file = fcheck(oldfd); + file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { @@ -1139,7 +1154,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) int retval = oldfd; rcu_read_lock(); - if (!fcheck_files(files, oldfd)) + if (!files_lookup_fd_rcu(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; @@ -1164,10 +1179,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) int f_dupfd(unsigned int from, struct file *file, unsigned flags) { + unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; - if (from >= rlimit(RLIMIT_NOFILE)) + if (from >= nofile) return -EINVAL; - err = alloc_fd(from, flags); + err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); diff --git a/fs/file_table.c b/fs/file_table.c index 709ada3151da..45437f8e1003 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -113,7 +113,6 @@ static struct file *__alloc_file(int flags, const struct cred *cred) rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); - eventpoll_init_file(f); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e6005c78bfa9..acfb55834af2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2321,10 +2321,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) wb = locked_inode_to_wb_and_lock_list(inode); - WARN((wb->bdi->capabilities & BDI_CAP_WRITEBACK) && - !test_bit(WB_registered, &wb->state), - "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); - inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 5a48cee6d7d3..f529075a2ce8 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) void *value = NULL; struct posix_acl *acl; + if (fuse_is_bad(inode)) + return ERR_PTR(-EIO); + if (!fc->posix_acl || fc->no_getxattr) return NULL; @@ -53,6 +56,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) const char *name; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fc->posix_acl || fc->no_setxattr) return -EOPNOTSUPP; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ff7dbeb16f88..78f9f209078c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -202,10 +202,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) int ret; inode = d_inode_rcu(entry); - if (inode && is_bad_inode(inode)) + if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & LOOKUP_REVAL)) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; @@ -328,12 +328,11 @@ static struct 
vfsmount *fuse_dentry_automount(struct path *path) if (!fm) goto out_put_fsc; - refcount_set(&fm->count, 1); fsc->s_fs_info = fm; sb = sget_fc(fsc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); - fuse_mount_put(fm); + kfree(fm); goto out_put_fsc; } fm->fc = fuse_conn_get(fc); @@ -463,6 +462,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, bool outarg_valid = true; bool locked; + if (fuse_is_bad(dir)) + return ERR_PTR(-EIO); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -542,6 +544,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); + + if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) && + !(flags & O_EXCL) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + args.opcode = FUSE_CREATE; args.nodeid = get_node_id(dir); args.in_numargs = 2; @@ -606,6 +614,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; + if (fuse_is_bad(dir)) + return -EIO; + if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) @@ -654,6 +665,9 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, int err; struct fuse_forget_link *forget; + if (fuse_is_bad(dir)) + return -EIO; + forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; @@ -781,6 +795,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -817,6 +834,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); + if (fuse_is_bad(dir)) + return -EIO; + args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -895,6 +915,9 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent, struct fuse_conn *fc = get_fuse_conn(olddir); int err; + if (fuse_is_bad(olddir)) + return -EIO; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; @@ -1030,7 +1053,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + fuse_make_bad(inode); err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, @@ -1232,6 +1255,9 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) return -EACCES; @@ -1327,7 +1353,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out_err; if (fc->cache_symlinks) @@ -1375,7 +1401,7 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (fc->no_fsyncdir) @@ -1649,10 +1675,20 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } + + /* Kill suid/sgid for non-directory chown unconditionally */ + if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) && + attr->ia_valid & 
(ATTR_UID | ATTR_GID)) + inarg.valid |= FATTR_KILL_SUIDGID; + if (attr->ia_valid & ATTR_SIZE) { /* For mandatory locking in truncate */ inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); + + /* Kill suid/sgid for truncate only if no CAP_FSETID */ + if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) + inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); @@ -1664,7 +1700,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { - make_bad_inode(inode); + fuse_make_bad(inode); err = -EIO; goto error; } @@ -1727,6 +1763,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1740,7 +1779,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) * * This should be done on write(), truncate() and chown(). */ - if (!fc->handle_killpriv) { + if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. @@ -1785,6 +1824,9 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fc)) { if (!request_mask) { /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c03034e8c152..8cccecb55fb8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -42,6 +42,12 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; + + if (fm->fc->handle_killpriv_v2 && + (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { + inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; + } + args.opcode = opcode; args.nodeid = nodeid; args.in_numargs = 1; @@ -226,6 +232,9 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) bool dax_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && FUSE_IS_DAX(inode); + if (fuse_is_bad(inode)) + return -EIO; + err = generic_file_open(inode, file); if (err) return err; @@ -463,7 +472,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) FUSE_ARGS(args); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; err = write_inode_now(inode, 1); @@ -535,7 +544,7 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; inode_lock(inode); @@ -859,7 +868,7 @@ static int fuse_readpage(struct file *file, struct page *page) int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; err = fuse_do_readpage(file, page); @@ -952,7 +961,7 @@ static void fuse_readahead(struct readahead_control *rac) struct fuse_conn *fc = get_fuse_conn(inode); unsigned int i, max_pages, nr_pages = 0; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return; max_pages = min_t(unsigned int, fc->max_pages, @@ -1097,6 +1106,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); + if (fm->fc->handle_killpriv_v2 && 
!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) @@ -1260,17 +1271,24 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; + struct fuse_conn *fc = get_fuse_conn(inode); loff_t endbyte = 0; - if (get_fuse_conn(inode)->writeback_cache) { + if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ err = fuse_update_attributes(mapping->host, file); if (err) return err; + if (fc->handle_killpriv_v2 && + should_remove_suid(file_dentry(file))) { + goto writethrough; + } + return generic_file_write_iter(iocb, from); } +writethrough: inode_lock(inode); /* We can write back this queue in page reclaim */ @@ -1451,7 +1469,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (write) { if (!capable(CAP_FSETID)) - ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; + ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; nres = fuse_send_write(ia, pos, nbytes, owner); } else { @@ -1555,7 +1573,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (FUSE_IS_DAX(inode)) @@ -1573,7 +1591,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; if (FUSE_IS_DAX(inode)) @@ -2172,7 +2190,7 @@ static int fuse_writepages(struct address_space *mapping, int err; err = -EIO; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) goto out; data.inode = inode; @@ -2281,6 +2299,9 @@ static int fuse_launder_page(struct page *page) int err = 0; if (clear_page_dirty_for_io(page)) { struct inode *inode = page->mapping->host; + + /* Serialize with pending writeback for the same page */ + fuse_wait_on_page_writeback(inode, page->index); err = fuse_writepage_locked(page); if (!err) fuse_wait_on_page_writeback(inode, page->index); @@ -2954,7 +2975,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, if (!fuse_allow_current_process(fc)) return -EACCES; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; return fuse_do_ioctl(file, cmd, arg, flags); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d51598017d13..7c4b8cb93f9f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -172,6 +172,8 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, + /* Bad inode */ + FUSE_I_BAD, }; struct fuse_conn; @@ -636,6 +638,14 @@ struct fuse_conn { unsigned int legacy_opts_show:1; /* + * fs kills suid/sgid/cap on write/chown/trunc. suid is killed on + * write/trunc only if caller did not have CAP_FSETID. sgid is killed + * on write/truncate only if caller did not have CAP_FSETID as well as + * file has group execute permission. + */ + unsigned handle_killpriv_v2:1; + + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction */ @@ -801,9 +811,6 @@ struct fuse_mount { /* Underlying (potentially shared) connection to the FUSE server */ struct fuse_conn *fc; - /* Refcount */ - refcount_t count; - /* * Super block for this connection (fc->killsb must be held when * accessing this). 
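Taken together, the FUSE_HANDLE_KILLPRIV_V2 hunks encode one rule in three places (open with O_TRUNC, setattr, write): chown always kills suid/sgid, while truncation and writes kill them only when the caller lacks CAP_FSETID, and the actual clearing is delegated to the server via the *_KILL_SUIDGID request flags. A hypothetical condensation of that rule (the enum is invented purely for illustration):

#include <stdbool.h>

enum priv_op { OP_CHOWN, OP_TRUNCATE, OP_WRITE };  /* illustrative labels */

/* true => set FATTR_KILL_SUIDGID / FUSE_OPEN_KILL_SUIDGID /
 * FUSE_WRITE_KILL_SUIDGID on the corresponding request */
static bool kill_suidgid(enum priv_op op, bool caller_has_fsetid)
{
        if (op == OP_CHOWN)
                return true;               /* unconditional for chown */
        return !caller_has_fsetid;         /* CAP_FSETID exempts the rest */
}
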
@@ -821,9 +828,7 @@ static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) { - struct fuse_mount *fm = get_fuse_mount_super(sb); - - return fm ? fm->fc : NULL; + return get_fuse_mount_super(sb)->fc; } static inline struct fuse_mount *get_fuse_mount(struct inode *inode) @@ -833,9 +838,7 @@ static inline struct fuse_mount *get_fuse_mount(struct inode *inode) static inline struct fuse_conn *get_fuse_conn(struct inode *inode) { - struct fuse_mount *fm = get_fuse_mount(inode); - - return fm ? fm->fc : NULL; + return get_fuse_mount_super(inode->i_sb)->fc; } static inline struct fuse_inode *get_fuse_inode(struct inode *inode) @@ -858,6 +861,16 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline void fuse_make_bad(struct inode *inode) +{ + set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); +} + +static inline bool fuse_is_bad(struct inode *inode) +{ + return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -1024,16 +1037,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, */ void fuse_conn_put(struct fuse_conn *fc); -/** - * Acquire reference to fuse_mount - */ -struct fuse_mount *fuse_mount_get(struct fuse_mount *fm); - -/** - * Release reference to fuse_mount - */ -void fuse_mount_put(struct fuse_mount *fm); - struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1a47afc95f80..b0e18b470e91 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -132,7 +132,7 @@ static void fuse_evict_inode(struct inode *inode) fi->forget = NULL; } } - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { + if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -204,6 +204,16 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_mode &= ~S_ISVTX; fi->orig_ino = attr->ino; + + /* + * We are refreshing inode data and it is possible that another + * client set suid/sgid or security.capability xattr. So clear + * S_NOSEC. Ideally, we could have cleared it only if suid/sgid + * was set or if security.capability xattr was set. But we don't + * know if security.capability has been set or not. So clear it + * anyway. Its less efficient but should be safe. 
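fuse_make_bad()/fuse_is_bad() above replace make_bad_inode(), which would have swapped out the inode's operations and mode; the private FUSE_I_BAD bit keeps the inode intact in fuse's nodeid-keyed cache while still making every entry point fail. The resulting guard pattern, repeated through the dir.c/file.c/xattr.c hunks (sketch):

static int fuse_some_operation(struct inode *inode)
{
        /* inode's type changed on the server: refuse all further I/O */
        if (fuse_is_bad(inode))
                return -EIO;
        /* ... normal request handling ... */
        return 0;
}
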
+ */ + inode->i_flags &= ~S_NOSEC; } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, @@ -342,7 +352,7 @@ retry: unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { /* Inode has changed type, any I/O on the old should fail */ - make_bad_inode(inode); + fuse_make_bad(inode); iput(inode); goto retry; } @@ -452,7 +462,8 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_mount *fm = get_fuse_mount_super(sb); - fuse_mount_put(fm); + fuse_conn_put(fm->fc); + kfree(fm); } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -705,7 +716,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; - refcount_set(&fm->count, 1); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -732,23 +742,6 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -void fuse_mount_put(struct fuse_mount *fm) -{ - if (refcount_dec_and_test(&fm->count)) { - if (fm->fc) - fuse_conn_put(fm->fc); - kfree(fm); - } -} -EXPORT_SYMBOL_GPL(fuse_mount_put); - -struct fuse_mount *fuse_mount_get(struct fuse_mount *fm) -{ - refcount_inc(&fm->count); - return fm; -} -EXPORT_SYMBOL_GPL(fuse_mount_get); - static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -1055,6 +1048,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, !fuse_dax_check_alignment(fc, arg->map_alignment)) { ok = false; } + if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + fc->handle_killpriv_v2 = 1; + fm->sb->s_flags |= SB_NOSEC; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1097,7 +1094,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | - FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; + FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | + FUSE_HANDLE_KILLPRIV_V2; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) ia->in.flags |= FUSE_MAP_ALIGNMENT; @@ -1465,7 +1463,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return 0; err_put_conn: - fuse_mount_put(fm); + fuse_conn_put(fc); + kfree(fm); sb->s_fs_info = NULL; err_fput: fput(file); @@ -1557,7 +1556,7 @@ void fuse_conn_destroy(struct fuse_mount *fm) } EXPORT_SYMBOL_GPL(fuse_conn_destroy); -static void fuse_kill_sb_anon(struct super_block *sb) +static void fuse_sb_destroy(struct super_block *sb) { struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; @@ -1567,6 +1566,11 @@ static void fuse_kill_sb_anon(struct super_block *sb) if (last) fuse_conn_destroy(fm); } +} + +static void fuse_kill_sb_anon(struct super_block *sb) +{ + fuse_sb_destroy(sb); kill_anon_super(sb); } @@ -1583,14 +1587,7 @@ MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static void fuse_kill_sb_blk(struct super_block *sb) { - struct fuse_mount *fm = get_fuse_mount_super(sb); - bool last; - - if (fm) { - last = fuse_mount_remove(fm); - if (last) - fuse_conn_destroy(fm); - } + fuse_sb_destroy(sb); kill_block_super(sb); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 3b5e91045871..3441ffa740f3 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -207,7 +207,7 @@ retry: dput(dentry); goto retry; } - if (is_bad_inode(inode)) { + if (fuse_is_bad(inode)) { dput(dentry); return -EIO; } @@ -568,7 +568,7 @@ int fuse_readdir(struct file 
*file, struct dir_context *ctx) struct inode *inode = file_inode(file); int err; - if (is_bad_inode(inode)) + if (fuse_is_bad(inode)) return -EIO; mutex_lock(&ff->readdir.lock); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 21a9e534417c..8868ac31a3c0 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1402,18 +1402,6 @@ static int virtio_fs_test_super(struct super_block *sb, return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } -static int virtio_fs_set_super(struct super_block *sb, - struct fs_context *fsc) -{ - int err; - - err = get_anon_bdev(&sb->s_dev); - if (!err) - fuse_mount_get(fsc->s_fs_info); - - return err; -} - static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; @@ -1432,22 +1420,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); - if (!fc) { - mutex_lock(&virtio_fs_mutex); - virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); - return -ENOMEM; - } + if (!fc) + goto out_err; fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); - if (!fm) { - mutex_lock(&virtio_fs_mutex); - virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); - kfree(fc); - return -ENOMEM; - } + if (!fm) + goto out_err; fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, fs); @@ -1456,14 +1436,20 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->auto_submounts = true; fsc->s_fs_info = fm; - sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); - fuse_mount_put(fm); + sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); + if (fsc->s_fs_info) { + fuse_conn_put(fc); + kfree(fm); + } if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = virtio_fs_fill_super(sb, fsc); if (err) { + fuse_conn_put(fc); + kfree(fm); + sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } @@ -1474,6 +1460,13 @@ static int virtio_fs_get_tree(struct fs_context *fsc) WARN_ON(fsc->root); fsc->root = dget(sb->s_root); return 0; + +out_err: + kfree(fc); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return err; } static const struct fs_context_operations virtio_fs_context_ops = { diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 371bdcbc7233..cdea18de94f7 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -113,6 +113,9 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; + if (fuse_is_bad(inode)) + return -EIO; + if (!fuse_allow_current_process(fm->fc)) return -EACCES; @@ -178,6 +181,9 @@ static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { + if (fuse_is_bad(inode)) + return -EIO; + return fuse_getxattr(inode, name, value, size); } @@ -186,6 +192,9 @@ static int fuse_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + if (fuse_is_bad(inode)) + return -EIO; + if (!value) return fuse_removexattr(inode, name); diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..cb008acf0efd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -155,7 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; - inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; @@ -580,8 +579,6 @@ static void evict(struct inode *inode) truncate_inode_pages_final(&inode->i_data); 
clear_inode(inode); } - if (S_ISBLK(inode->i_mode) && inode->i_bdev) - bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); diff --git a/fs/internal.h b/fs/internal.h index a7cd0f64faa4..77c50befbfbe 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,7 +25,6 @@ extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); -void bd_forget(struct inode *inode); #else static inline void bdev_cache_init(void) { @@ -43,9 +42,6 @@ static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; } -static inline void bd_forget(struct inode *inode) -{ -} #endif /* CONFIG_BLOCK */ /* @@ -78,6 +74,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, long do_rmdir(int dfd, struct filename *name); long do_unlinkat(int dfd, struct filename *name); int may_linkat(struct path *link); +int do_renameat2(int olddfd, struct filename *oldname, int newdfd, + struct filename *newname, unsigned int flags); /* * namespace.c @@ -114,7 +112,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -extern struct super_block *user_get_super(dev_t); +struct super_block *user_get_super(dev_t, bool excl); +void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); /* diff --git a/fs/io-wq.c b/fs/io-wq.c index b53c055bea6a..f72d53848dcb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) -{ - return work == data; -} - -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) -{ - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index cba36f03c355..069496c6d4f9 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) } void io_wq_cancel_all(struct io_wq *wq); -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 86dac2b2e276..6f9392c35eef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -245,6 +245,8 @@ struct io_sq_data { struct task_struct *thread; struct wait_queue_head wait; + + unsigned sq_thread_idle; }; struct io_ring_ctx { @@ -285,7 +287,6 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head cq_overflow_list; - wait_queue_head_t inflight_wait; struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; @@ -310,7 +311,6 @@ struct io_ring_ctx { struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; - struct wait_queue_entry sqo_wait_entry; struct list_head sqd_list; /* @@ -395,16 +395,18 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - union { - struct wait_queue_head *head; - u64 addr; - }; + struct wait_queue_head *head; __poll_t events; bool done; bool canceled; struct wait_queue_entry wait; }; +struct io_poll_remove { + struct file *file; + u64 addr; +}; + struct io_close { struct file 
*file; struct file *put_file; @@ -444,11 +446,17 @@ struct io_timeout { u32 off; u32 target_seq; struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; }; struct io_timeout_rem { struct file *file; u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; }; struct io_rw { @@ -541,6 +549,27 @@ struct io_statx { struct statx __user *buffer; }; +struct io_shutdown { + struct file *file; + int how; +}; + +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + struct io_completion { struct file *file; struct list_head list; @@ -575,7 +604,6 @@ enum { REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_LINK_HEAD_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -607,8 +635,6 @@ enum { /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* head of a link */ - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -651,6 +677,7 @@ struct io_kiocb { struct file *file; struct io_rw rw; struct io_poll_iocb poll; + struct io_poll_remove poll_remove; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -667,6 +694,9 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + struct io_shutdown shutdown; + struct io_rename rename; + struct io_unlink unlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -686,15 +716,14 @@ struct io_kiocb { struct task_struct *task; u64 user_data; - struct list_head link_list; + struct io_kiocb *link; + struct percpu_ref *fixed_file_refs; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ struct list_head inflight_entry; - - struct percpu_ref *fixed_file_refs; struct callback_head task_work; /* for polled requests, i.e. 
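io_rename and io_unlink above back the new IORING_OP_RENAMEAT/IORING_OP_UNLINKAT opcodes, which run do_renameat2()/do_unlinkat() from worker context (hence the MM/FILES/FS work flags in the op table below). A hypothetical user-space sketch via liburing, assuming a liburing release new enough to provide io_uring_prep_renameat():

#include <fcntl.h>      /* AT_FDCWD */
#include <liburing.h>

static int rename_async(struct io_uring *ring, const char *from, const char *to)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        if (!sqe)
                return -EBUSY;
        io_uring_prep_renameat(sqe, AT_FDCWD, from, AT_FDCWD, to, 0);
        ret = io_uring_submit(ring);
        if (ret < 0)
                return ret;
        ret = io_uring_wait_cqe(ring, &cqe);
        if (ret < 0)
                return ret;
        ret = cqe->res;                 /* 0 on success, -errno on failure */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}
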
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -725,6 +754,8 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + bool plug_started; + /* * Batch completion logic */ @@ -735,7 +766,7 @@ struct io_submit_state { */ struct file *file; unsigned int fd; - unsigned int has_refs; + unsigned int file_refs; unsigned int ios_left; }; @@ -757,6 +788,8 @@ struct io_op_def { unsigned buffer_select : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; + /* should block plug */ + unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; unsigned work_flags; @@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, @@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | IO_WQ_WORK_MM, @@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), .work_flags = IO_WQ_WORK_MM, }, - [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .work_flags = IO_WQ_WORK_MM, + }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + IO_WQ_WORK_FS | IO_WQ_WORK_MM, }, [IORING_OP_CLOSE] = { .needs_file = 1, @@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT2] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG, + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -934,6 +974,17 @@ static const 
struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + }, + [IORING_OP_RENAMEAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, + [IORING_OP_UNLINKAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) + static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -static void io_sq_thread_drop_mm(void) +static inline void io_set_resource_node(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!req->fixed_file_refs) { + req->fixed_file_refs = &ctx->file_data->node->refs; + percpu_ref_get(req->fixed_file_refs); + } +} + +static bool io_match_task(struct io_kiocb *head, + struct task_struct *task, + struct files_struct *files) { + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (!files) + return true; + + io_for_each_link(req, head) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES) && + req->work.identity->files == files) + return true; + } + return false; +} + +static void io_sq_thread_drop_mm_files(void) +{ + struct files_struct *files = current->files; struct mm_struct *mm = current->mm; if (mm) { @@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void) mmput(mm); current->mm = NULL; } + if (files) { + struct nsproxy *nsproxy = current->nsproxy; + + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); + put_files_struct(files); + put_nsproxy(nsproxy); + } +} + +static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) +{ + if (!current->files) { + struct files_struct *files; + struct nsproxy *nsproxy; + + task_lock(ctx->sqo_task); + files = ctx->sqo_task->files; + if (!files) { + task_unlock(ctx->sqo_task); + return -EOWNERDEAD; + } + atomic_inc(&files->count); + get_nsproxy(ctx->sqo_task->nsproxy); + nsproxy = ctx->sqo_task->nsproxy; + task_unlock(ctx->sqo_task); + + task_lock(current); + current->files = files; + current->nsproxy = nsproxy; + task_unlock(current); + } + return 0; } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) @@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) return -EFAULT; } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, + struct io_kiocb *req) { - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) - return 0; - return __io_sq_thread_acquire_mm(ctx); + const struct io_op_def *def = &io_op_defs[req->opcode]; + int ret; + + if (def->work_flags & IO_WQ_WORK_MM) { + ret = __io_sq_thread_acquire_mm(ctx); + if (unlikely(ret)) + return ret; + } + + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { + ret = __io_sq_thread_acquire_files(ctx); + if (unlikely(ret)) + return ret; + } + + return 0; } static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, @@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); 
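
The io_for_each_link() macro introduced above is the heart of this series: the request chain moves from a doubly linked list_head to a single ->link pointer per request, so walking a chain becomes a plain pointer chase. A minimal sketch of the pattern on a simplified stand-in type (struct req here is hypothetical, not the real struct io_kiocb):

	struct req {
		struct req *link;	/* next request in the chain, NULL at the tail */
	};

	#define for_each_link(pos, head) \
		for ((pos) = (head); (pos); (pos) = (pos)->link)

	/* Count a chain the way the reworked io_get_sequence() does. */
	static unsigned int chain_len(struct req *head)
	{
		struct req *pos;
		unsigned int n = 0;

		for_each_link(pos, head)
			n++;
		return n;
	}
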
INIT_LIST_HEAD(&ctx->timeout_list); - init_waitqueue_head(&ctx->inflight_wait); spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); @@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; - io_prep_async_work(req); - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(cur, &req->link_list, link_list) - io_prep_async_work(cur); + io_for_each_link(cur, req) + io_prep_async_work(cur); } static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) @@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req) } } -static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (!tsk || req->task == tsk) - return true; - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (ctx->sq_data && req->task == ctx->sq_data->thread) - return true; - } - return false; -} - /* * Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct io_kiocb *req, *tmp; int canceled = 0; spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_task_match(req, tsk)) { + if (io_match_task(req, tsk, files)) { io_kill_timeout(req); canceled++; } @@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool __io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - return ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) && - req->work.identity->files == files; -} - -static bool io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - struct io_kiocb *link; - - if (!files) - return true; - if (__io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (__io_match_files(link, files)) - return true; - } - } - return false; -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { - if (tsk && req->task != tsk) - continue; - if (!io_match_files(req, files)) + if (!io_match_task(req, tsk, files)) continue; cqe = io_get_cqring(ctx); @@ -1845,9 +1935,7 @@ fallback: static inline void io_put_file(struct io_kiocb *req, struct file *file, bool fixed) { - if (fixed) - percpu_ref_put(req->fixed_file_refs); - else + if (!fixed) fput(file); } @@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - + if (req->fixed_file_refs) + percpu_ref_put(req->fixed_file_refs); io_req_clean_work(req); } @@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1890,8 +1987,8 @@ static void 
io_kill_linked_timeout(struct io_kiocb *req) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); + link = req->link; + /* * Can happen if a linked timeout fired and link had been like * req -> link t-out -> link t-out [-> ...] @@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req) struct io_timeout_data *io = link->async_data; int ret; - list_del_init(&link->link_list); + io_remove_next_linked(req); + link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); @@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req) } } -static struct io_kiocb *io_req_link_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - - /* - * The list should never be empty when we are called here. But could - * potentially happen if the chain is messed up, check to be on the - * safe side. - */ - if (unlikely(list_empty(&req->link_list))) - return NULL; - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - return nxt; -} - -/* - * Called if REQ_F_LINK_HEAD is set, and we fail the head request - */ static void io_fail_links(struct io_kiocb *req) { + struct io_kiocb *link, *nxt; struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - while (!list_empty(&req->link_list)) { - struct io_kiocb *link = list_first_entry(&req->link_list, - struct io_kiocb, link_list); + link = req->link; + req->link = NULL; - list_del_init(&link->link_list); - trace_io_uring_fail_link(req, link); + while (link) { + nxt = link->link; + link->link = NULL; + trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); /* @@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req) io_put_req_deferred(link, 2); else io_double_put_req(link); + link = nxt; } - io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req) static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -1983,20 +2061,24 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) - return io_req_link_next(req); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { + struct io_kiocb *nxt = req->link; + + req->link = NULL; + return nxt; + } io_fail_links(req); return NULL; } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) return NULL; return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) +static int io_req_task_work_add(struct io_kiocb *req) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; @@ -2013,7 +2095,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) * will do the job. 
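
With a singly linked chain, unlinking the successor is O(1), as io_remove_next_linked() a few hunks up shows, and failing a whole chain becomes a simple walk that detaches each node before completing it. A sketch of both operations, reusing the hypothetical struct req from the earlier example (fail() stands in for the real completion call):

	/* Splice the node after @req out of the chain and fully detach it. */
	static void remove_next_linked(struct req *req)
	{
		struct req *nxt = req->link;	/* caller guarantees nxt != NULL */

		req->link = nxt->link;
		nxt->link = NULL;
	}

	/* Fail every request hanging off @req, like the reworked io_fail_links(). */
	static void fail_links(struct req *req, void (*fail)(struct req *))
	{
		struct req *link = req->link;

		req->link = NULL;
		while (link) {
			struct req *nxt = link->link;

			link->link = NULL;	/* detach before completing */
			fail(link);
			link = nxt;
		}
	}
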
*/ notify = TWA_NONE; - if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, &req->task_work, notify); @@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (!__io_sq_thread_acquire_mm(ctx)) { + if (!__io_sq_thread_acquire_mm(ctx) && + !__io_sq_thread_acquire_files(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); @@ -2075,7 +2158,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req) } } -static void io_queue_next(struct io_kiocb *req) +static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) io_free_req(req); return; } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); + io_queue_next(req); if (req->task != rb->task) { if (rb->task) { @@ -2197,7 +2279,7 @@ static void io_free_req_deferred(struct io_kiocb *req) int ret; init_task_work(&req->task_work, io_put_req_deferred_cb); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) * we wake up the task, and the next invocation will flush the * entries. We cannot safely do it from here. */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) + if (noflush) return -1U; io_cqring_overflow_flush(ctx, false, NULL, NULL); @@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - ret = io_sq_thread_acquire_mm(req->ctx, req); + ret = io_sq_thread_acquire_mm_files(req->ctx, req); if (io_resubmit_prep(req, ret)) { refcount_inc(&req->refs); @@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_iopoll_getevents() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) { struct io_ring_ctx *ctx = req->ctx; @@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req) else list_add_tail(&req->inflight_entry, &ctx->iopoll_list); - if ((ctx->flags & IORING_SETUP_SQPOLL) && + /* + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread + * task context or in io worker task context. If current task context is + * sq thread, we don't need to check whether we should wake up the sq thread.
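
The in_async flag added to io_iopoll_req_issued() avoids a pointless self-wakeup when the sq thread itself issued the request; the wakeup that remains follows the usual lockless-waiter pattern of testing wq_has_sleeper() before touching the wait queue. Sketched in isolation with a hypothetical helper name:

	#include <linux/wait.h>

	/* Wake a possibly sleeping poller; skip the queue entirely when nobody waits. */
	static void kick_sleeper(struct wait_queue_head *wq)
	{
		/* wq_has_sleeper() carries the barrier pairing with prepare_to_wait(). */
		if (wq_has_sleeper(wq))
			wake_up(wq);
	}
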
+ */ + if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); } -static void __io_state_file_put(struct io_submit_state *state) +static inline void __io_state_file_put(struct io_submit_state *state) { - if (state->has_refs) - fput_many(state->file, state->has_refs); - state->file = NULL; + fput_many(state->file, state->file_refs); + state->file_refs = 0; } static inline void io_state_file_put(struct io_submit_state *state) { - if (state->file) + if (state->file_refs) __io_state_file_put(state); } @@ -2698,29 +2784,25 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (!state) return fget(fd); - if (state->file) { + if (state->file_refs) { if (state->fd == fd) { - state->has_refs--; + state->file_refs--; return state->file; } __io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); - if (!state->file) + if (unlikely(!state->file)) return NULL; state->fd = fd; - state->has_refs = state->ios_left - 1; + state->file_refs = state->ios_left - 1; return state->file; } static bool io_bdev_nowait(struct block_device *bdev) { -#ifdef CONFIG_BLOCK return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); -#else - return true; -#endif } /* @@ -2733,14 +2815,16 @@ static bool io_file_supports_async(struct file *file, int rw) umode_t mode = file_inode(file)->i_mode; if (S_ISBLK(mode)) { - if (io_bdev_nowait(file->f_inode->i_bdev)) + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(I_BDEV(file->f_mapping->host))) return true; return false; } if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { - if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(file->f_inode->i_sb->s_bdev) && file->f_op != &io_uring_fops) return true; return false; @@ -3065,7 +3149,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock) { @@ -3094,7 +3178,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; - return ret < 0 ? ret : sqe_len; + return ret; } if (req->flags & REQ_F_BUFFER_SELECT) { @@ -3111,18 +3195,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, req->ctx->compat); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) -{ - struct io_async_rw *iorw = req->async_data; - - if (!iorw) - return __io_import_iovec(rw, req, iovec, iter, needs_lock); - *iovec = NULL; - return iov_iter_count(&iorw->iter); -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? 
NULL : &kiocb->ki_pos; @@ -3246,7 +3318,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) struct iovec *iov = iorw->fast_iov; ssize_t ret; - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; @@ -3305,7 +3377,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -3379,17 +3451,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; - size_t iov_count; bool no_async; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; - iov_count = iov_iter_count(iter); - io_size = ret; + iovec = NULL; + } else { + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } + io_size = iov_iter_count(iter); req->result = io_size; ret = 0; @@ -3405,7 +3477,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (no_async) goto copy_iov; - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3424,7 +3496,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (req->file->f_flags & O_NONBLOCK) goto done; /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; goto copy_iov; } else if (ret < 0) { @@ -3507,17 +3579,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; - size_t iov_count; ssize_t ret, ret2, io_size; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; - iov_count = iov_iter_count(iter); - io_size = ret; + iovec = NULL; + } else { + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } + io_size = iov_iter_count(iter); req->result = io_size; /* Ensure we clear previously set non-block flag */ @@ -3535,7 +3607,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3578,7 +3650,7 @@ done: } else { copy_iov: /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; @@ -3590,6 +3662,135 @@ out_free: return ret; } +static int io_renameat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = &req->rename; + const char __user *oldf, *newf; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = 
u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_renameat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_rename *ren = &req->rename; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + +static int io_unlinkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = &req->unlink; + const char __user *fname; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_unlink *un = &req->unlink; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + +static int io_shutdown_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_NET) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index) + return -EINVAL; + + req->shutdown.how = READ_ONCE(sqe->len); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_shutdown(struct io_kiocb *req, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (force_nonblock) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, req->shutdown.how); + io_req_complete(req, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3804,7 +4005,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); @@ -3818,7 +4019,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); @@ -3948,11 +4149,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, head = idr_find(&ctx->io_buffer_idr, p->bgid); if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); - - io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) 
req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + + /* need to hold the lock to complete IOPOLL requests */ + if (ctx->flags & IORING_SETUP_IOPOLL) { + __io_req_complete(req, ret, 0, cs); + io_ring_submit_unlock(ctx, !force_nonblock); + } else { + io_ring_submit_unlock(ctx, !force_nonblock); + __io_req_complete(req, ret, 0, cs); + } return 0; } @@ -4037,10 +4244,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, } } out: - io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + + /* need to hold the lock to complete IOPOLL requests */ + if (ctx->flags & IORING_SETUP_IOPOLL) { + __io_req_complete(req, ret, 0, cs); + io_ring_submit_unlock(ctx, !force_nonblock); + } else { + io_ring_submit_unlock(ctx, !force_nonblock); + __io_req_complete(req, ret, 0, cs); + } return 0; } @@ -4212,7 +4426,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_req_init_async(req); req->work.flags |= IO_WQ_WORK_NO_CANCEL; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) @@ -4236,7 +4450,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, /* might be already done during nonblock submission */ if (!close->put_file) { - ret = __close_fd_get_file(close->fd, &close->put_file); + ret = close_fd_get_file(close->fd, &close->put_file); if (ret < 0) return (ret == -ENOENT) ? -EBADF : ret; } @@ -4356,9 +4570,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4405,9 +4619,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) @@ -4585,9 +4799,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->async_data) { kmsg = req->async_data; @@ -4648,9 +4862,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock, unsigned flags; int ret, cflags = 0; - sock = sock_from_file(req->file, &ret); + sock = sock_from_file(req->file); if (unlikely(!sock)) - return ret; + return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { kbuf = io_recv_buffer_select(req, !force_nonblock); @@ -4694,7 +4908,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; @@ -4735,7 +4949,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_connect *conn = &req->connect; struct io_async_connect *io = req->async_data; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if 
(unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; @@ -4859,7 +5073,6 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - bool twa_signal_ok; int ret; /* for instances that support it check for an event match first: */ @@ -4875,20 +5088,12 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, percpu_ref_get(&req->ctx->refs); /* - * If we using the signalfd wait_queue_head for this wakeup, then - * it's not safe to use TWA_SIGNAL as we could be recursing on the - * tsk->sighand->siglock on doing the wakeup. Should not be needed - * either, as the normal wakeup will suffice. - */ - twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); - - /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, twa_signal_ok); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -5279,7 +5484,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5291,7 +5497,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_task_match(req, tsk)) + if (io_match_task(req, tsk, files)) posted += io_poll_remove_one(req); } } @@ -5329,7 +5535,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, sqe->poll_events) return -EINVAL; - req->poll.addr = READ_ONCE(sqe->addr); + req->poll_remove.addr = READ_ONCE(sqe->addr); return 0; } @@ -5340,12 +5546,10 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - u64 addr; int ret; - addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, addr); + ret = io_poll_cancel(ctx, req->poll_remove.addr); spin_unlock_irq(&ctx->completion_lock); if (ret < 0) @@ -5438,15 +5642,37 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static int __io_timeout_cancel(struct io_kiocb *req) +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, + __u64 user_data) { - struct io_timeout_data *io = req->async_data; - int ret; + struct io_timeout_data *io; + struct io_kiocb *req; + int ret = -ENOENT; + + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { + if (user_data == req->user_data) { + ret = 0; + break; + } + } + + if (ret == -ENOENT) + return ERR_PTR(ret); + io = req->async_data; ret = hrtimer_try_to_cancel(&io->timer); if (ret == -1) - return -EALREADY; + return ERR_PTR(-EALREADY); list_del_init(&req->timeout.list); + return req; +} + +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +{ + struct io_kiocb *req = io_timeout_extract(ctx, user_data); + + if (IS_ERR(req)) + return PTR_ERR(req); req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); @@ -5454,35 +5680,48 @@ 
static int __io_timeout_cancel(struct io_kiocb *req) return 0; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) { - struct io_kiocb *req; - int ret = -ENOENT; - - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - if (user_data == req->user_data) { - ret = 0; - break; - } - } + struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_timeout_data *data; - if (ret == -ENOENT) - return ret; + if (IS_ERR(req)) + return PTR_ERR(req); - return __io_timeout_cancel(req); + req->timeout.off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&req->timeout.list, &ctx->timeout_list); + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; } static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_timeout_rem *tr = &req->timeout_rem; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) + if (sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE) { + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ return -EINVAL; + } - req->timeout_rem.addr = READ_ONCE(sqe->addr); return 0; } @@ -5491,11 +5730,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req, */ static int io_timeout_remove(struct io_kiocb *req) { + struct io_timeout_rem *tr = &req->timeout_rem; struct io_ring_ctx *ctx = req->ctx; int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) + ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + } else { + ret = io_timeout_cancel(ctx, tr->addr); + } io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -5775,6 +6022,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: return io_tee_prep(req, sqe); + case IORING_OP_SHUTDOWN: + return io_shutdown_prep(req, sqe); + case IORING_OP_RENAMEAT: + return io_renameat_prep(req, sqe); + case IORING_OP_UNLINKAT: + return io_unlinkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -5796,11 +6049,10 @@ static u32 io_get_sequence(struct io_kiocb *req) { struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; - u32 total_submitted, nr_reqs = 1; + u32 total_submitted, nr_reqs = 0; - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(pos, &req->link_list, link_list) - nr_reqs++; + io_for_each_link(pos, req) + nr_reqs++; total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; return total_submitted - nr_reqs; @@ -5852,12 +6104,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_req_drop_files(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; put_files_struct(req->work.identity->files); @@ -5912,6 +6165,13 @@ static void __io_clean_op(struct io_kiocb *req) if (req->open.filename) putname(req->open.filename); break; + case IORING_OP_RENAMEAT: + putname(req->rename.oldpath); + putname(req->rename.newpath); + break; + case IORING_OP_UNLINKAT: + putname(req->unlink.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6018,6 +6278,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_TEE: ret = io_tee(req, force_nonblock); break; + case IORING_OP_SHUTDOWN: + ret = io_shutdown(req, force_nonblock); + break; + case IORING_OP_RENAMEAT: + ret = io_renameat(req, force_nonblock); + break; + case IORING_OP_UNLINKAT: + ret = io_unlinkat(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -6034,7 +6303,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, if (in_async) mutex_lock(&ctx->uring_lock); - io_iopoll_req_issued(req); + io_iopoll_req_issued(req, in_async); if (in_async) mutex_unlock(&ctx->uring_lock); @@ -6074,8 +6343,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) } if (ret) { + /* + * io_iopoll_complete() does not hold completion_lock to complete + * polled io, so for polled io just mark it as done here and let + * io_iopoll_complete() finish it.
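
io_timeout_update() above re-arms the existing hrtimer instead of cancelling and resubmitting a whole request. The core re-arm pattern, sketched with hypothetical names (rearm_timer() is not a real kernel helper):

	#include <linux/hrtimer.h>
	#include <linux/time64.h>

	static int rearm_timer(struct hrtimer *timer,
			       enum hrtimer_restart (*fn)(struct hrtimer *),
			       struct timespec64 *ts, enum hrtimer_mode mode)
	{
		/* -1 means the callback is running right now; bail like -EALREADY. */
		if (hrtimer_try_to_cancel(timer) == -1)
			return -EALREADY;

		hrtimer_init(timer, CLOCK_MONOTONIC, mode);
		timer->function = fn;	/* hrtimer_init() resets the callback */
		hrtimer_start(timer, timespec64_to_ktime(*ts), mode);
		return 0;
	}
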
+ */ + if (req->ctx->flags & IORING_SETUP_IOPOLL) { + struct kiocb *kiocb = &req->rw.kiocb; + + kiocb_done(kiocb, ret, NULL); + } else { + req_set_fail_links(req); + io_req_complete(req, ret); + } } return io_steal_work(req); @@ -6101,10 +6381,7 @@ static struct file *io_file_get(struct io_submit_state *state, return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); - if (file) { - req->fixed_file_refs = &ctx->file_data->node->refs; - percpu_ref_get(req->fixed_file_refs); - } + io_set_resource_node(req); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); @@ -6113,45 +6390,26 @@ static struct file *io_file_get(struct io_submit_state *state, return file; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd) -{ - bool fixed; - - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; - if (unlikely(!fixed && io_async_submit(req->ctx))) - return -EBADF; - - req->file = io_file_get(state, req, fd, fixed); - if (req->file || io_op_defs[req->opcode].needs_file_no_error) - return 0; - return -EBADF; -} - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); - struct io_kiocb *req = data->req; + struct io_kiocb *prev, *req = data->req; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *prev = NULL; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + prev = req->timeout.head; + req->timeout.head = NULL; /* * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->link_list)) { - prev = list_entry(req->link_list.prev, struct io_kiocb, - link_list); - if (refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); - else - prev = NULL; - } - + if (prev && refcount_inc_not_zero(&prev->refs)) + io_remove_next_linked(prev); + else + prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { @@ -6167,10 +6425,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void __io_queue_linked_timeout(struct io_kiocb *req) { /* - * If the list is now empty, then our linked request finished before - * we got a chance to setup the timer + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer */ - if (!list_empty(&req->link_list)) { + if (req->timeout.head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; @@ -6193,18 +6451,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt; - - if (!(req->flags & REQ_F_LINK_HEAD)) - return NULL; - if (req->flags & REQ_F_LINK_TIMEOUT) - return NULL; + struct io_kiocb *nxt = req->link; - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || + nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; @@ -6310,8 +6563,13 @@ static inline void io_queue_link_head(struct io_kiocb *req, io_queue_sqe(req, NULL, cs); } +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link, 
struct io_comp_state *cs) + struct io_submit_link *link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -6323,8 +6581,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (*link) { - struct io_kiocb *head = *link; + if (link->head) { + struct io_kiocb *head = link->head; /* * Taking sequential execution of a link, draining both sides @@ -6344,12 +6602,13 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - list_add_tail(&req->link_list, &head->link_list); + link->last->link = req; + link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { io_queue_link_head(head, cs); - *link = NULL; + link->head = NULL; } } else { if (unlikely(ctx->drain_next)) { @@ -6357,13 +6616,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - req->flags |= REQ_F_LINK_HEAD; - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; - *link = req; + link->head = req; + link->last = req; } else { io_queue_sqe(req, sqe, cs); } @@ -6379,7 +6636,8 @@ static void io_submit_state_end(struct io_submit_state *state) { if (!list_empty(&state->comp.list)) io_submit_flush_completions(&state->comp); - blk_finish_plug(&state->plug); + if (state->plug_started) + blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); @@ -6391,12 +6649,12 @@ static void io_submit_state_end(struct io_submit_state *state) static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { - blk_start_plug(&state->plug); + state->plug_started = false; state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; state->free_reqs = 0; - state->file = NULL; + state->file_refs = 0; state->ios_left = max_ios; } @@ -6491,6 +6749,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->ctx = ctx; req->flags = 0; + req->link = NULL; + req->fixed_file_refs = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; @@ -6499,7 +6759,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); @@ -6532,10 +6792,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags; - if (!io_op_defs[req->opcode].needs_file) - return 0; + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. 
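
Plugging is now deferred until the first opcode marked .plug = 1 is actually seen, and skipped entirely for single-IO submissions, as the comment above describes. A sketch of the lazy-plug idiom under those assumptions (struct submit_state is a stand-in for the real submit state):

	#include <linux/blkdev.h>

	struct submit_state {
		struct blk_plug plug;
		bool plug_started;
		unsigned int ios_left;
	};

	static void maybe_start_plug(struct submit_state *s, bool op_wants_plug)
	{
		/* Plug only block-ish IO, and only when a batch will follow. */
		if (!s->plug_started && s->ios_left > 1 && op_wants_plug) {
			blk_start_plug(&s->plug);
			s->plug_started = true;
		}
	}

	static void finish_submit(struct submit_state *s)
	{
		if (s->plug_started)
			blk_finish_plug(&s->plug);
	}
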
+ */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + + ret = 0; + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file && + !io_op_defs[req->opcode].needs_file_no_error)) + ret = -EBADF; + } - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); state->ios_left--; return ret; } @@ -6543,7 +6819,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; - struct io_kiocb *link = NULL; + struct io_submit_link link; int i, submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ @@ -6563,6 +6839,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) refcount_add(nr, ¤t->usage); io_submit_state_start(&state, ctx, nr); + link.head = NULL; for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6608,8 +6885,8 @@ fail_req: percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link) - io_queue_link_head(link, &state.comp); + if (link.head) + io_queue_link_head(link.head, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ @@ -6633,111 +6910,45 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, - int sync, void *key) -{ - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); - int ret; - - ret = autoremove_wake_function(wqe, mode, sync, key); - if (ret) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } - return ret; -} - -enum sq_ret { - SQT_IDLE = 1, - SQT_SPIN = 2, - SQT_DID_WORK = 4, -}; - -static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, - unsigned long start_jiffies, bool cap_entries) +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; - struct io_sq_data *sqd = ctx->sq_data; unsigned int to_submit; int ret = 0; -again: - if (!list_empty(&ctx->iopoll_list)) { + to_submit = io_sqring_entries(ctx); + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > 8) + to_submit = 8; + + if (!list_empty(&ctx->iopoll_list) || to_submit) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); + + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } - to_submit = io_sqring_entries(ctx); - - /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. - */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - io_sq_thread_drop_mm(); - - /* - * We're polling. 
If we're within the defined idle * period, then let us spin without work before going * to sleep. The exception is if we got EBUSY doing * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) - return SQT_SPIN; + if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, - TASK_INTERRUPTIBLE); + return ret; +} - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. - */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - goto again; - } +static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) - return SQT_IDLE; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (sq_thread_idle < ctx->sq_thread_idle) + sq_thread_idle = ctx->sq_thread_idle; } - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - io_ring_clear_wakeup_flag(ctx); - - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > 8) - to_submit = 8; - - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - - return SQT_DID_WORK; + sqd->sq_thread_idle = sq_thread_idle; } static void io_sqd_init_new(struct io_sq_data *sqd) @@ -6746,39 +6957,56 @@ static void io_sqd_init_new(struct io_sq_data *sqd) while (!list_empty(&sqd->ctx_new_list)) { ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); - init_wait(&ctx->sqo_wait_entry); - ctx->sqo_wait_entry.func = io_sq_wake_function; list_move_tail(&ctx->sqd_list, &sqd->ctx_list); complete(&ctx->sq_thread_comp); } + + io_sqd_update_thread_idle(sqd); } static int io_sq_thread(void *data) { struct cgroup_subsys_state *cur_css = NULL; + struct files_struct *old_files = current->files; + struct nsproxy *old_nsproxy = current->nsproxy; const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - unsigned long start_jiffies; + unsigned long timeout = 0; + DEFINE_WAIT(wait); + + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); - start_jiffies = jiffies; while (!kthread_should_stop()) { - enum sq_ret ret = 0; - bool cap_entries; + int ret; + bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the * kthread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) + if (kthread_should_park()) { kthread_parkme(); + /* + * When the sq thread is unparked, the previous park operation may have + * come from io_put_sq_data(), which means the sq thread is about to be + * stopped, so we need to check for that here.
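
The rewritten io_sq_thread() drops the SQT_* state machine for a single jiffies deadline shared across all rings: keep spinning while any ring made progress, and only sleep once sq_thread_idle elapses with nothing to do. A skeleton of that control flow (do_one_pass(), sleep_until_work(), idle and timeout are hypothetical stand-ins for the per-ring submit pass and the prepare_to_wait()/schedule()/finish_wait() sequence):

	while (!kthread_should_stop()) {
		bool made_progress = do_one_pass();	/* stand-in for __io_sq_thread() */

		if (made_progress || !time_after(jiffies, timeout)) {
			cond_resched();
			if (made_progress)
				timeout = jiffies + idle;	/* push the deadline out */
			continue;
		}
		sleep_until_work();	/* only once the idle period has fully elapsed */
		timeout = jiffies + idle;
	}
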
+ */ + if (kthread_should_stop()) + break; + } - if (unlikely(!list_empty(&sqd->ctx_new_list))) + if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); + timeout = jiffies + sqd->sq_thread_idle; + } + sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { if (current->cred != ctx->creds) { if (old_cred) @@ -6791,24 +7019,49 @@ static int io_sq_thread(void *data) current->sessionid = ctx->sessionid; #endif - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); + ret = __io_sq_thread(ctx, cap_entries); + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + sqt_spin = true; - io_sq_thread_drop_mm(); + io_sq_thread_drop_mm_files(); } - if (ret & SQT_SPIN) { + if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); cond_resched(); - } else if (ret == SQT_IDLE) { - if (kthread_should_park()) - continue; + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + if (kthread_should_park()) + continue; + + needs_sched = true; + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + schedule(); - start_jiffies = jiffies; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; } io_run_task_work(); @@ -6818,6 +7071,11 @@ static int io_sq_thread(void *data) if (old_cred) revert_creds(old_cred); + task_lock(current); + current->files = old_files; + current->nsproxy = old_nsproxy; + task_unlock(current); + kthread_parkme(); return 0; @@ -6862,13 +7120,8 @@ static int io_run_task_work_sig(void) return 1; if (!signal_pending(current)) return 0; - if (current->jobctl & JOBCTL_TASK_WORK) { - spin_lock_irq(¤t->sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 1; - } + if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL)) + return -ERESTARTSYS; return -EINTR; } @@ -6877,7 +7130,8 @@ static int io_run_task_work_sig(void) * application must reap them itself, as they reside on the shared cq ring. 
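
The uts handling added to io_cqring_wait() below is the classic schedule_timeout() loop: convert the user timespec to jiffies once, then let the remaining budget tick down across sleeps and fail with -ETIME when it reaches zero. In isolation, ignoring signal handling (wait_cond_timeout() and the cond callback are hypothetical):

	#include <linux/jiffies.h>
	#include <linux/sched.h>
	#include <linux/wait.h>

	static int wait_cond_timeout(struct wait_queue_head *wq,
				     bool (*cond)(void *), void *arg,
				     struct timespec64 *ts)
	{
		signed long timeout = timespec64_to_jiffies(ts);
		DEFINE_WAIT(wait);
		int ret = 0;

		for (;;) {
			prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
			if (cond(arg))
				break;
			timeout = schedule_timeout(timeout);
			if (!timeout) {
				ret = -ETIME;	/* budget exhausted */
				break;
			}
		}
		finish_wait(wq, &wait);
		return ret;
	}
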
*/ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz) + const sigset_t __user *sig, size_t sigsz, + struct __kernel_timespec __user *uts) { struct io_wait_queue iowq = { .wq = { @@ -6889,6 +7143,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; + struct timespec64 ts; + signed long timeout = 0; int ret = 0; do { @@ -6911,6 +7167,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = timespec64_to_jiffies(&ts); + } + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { @@ -6924,7 +7186,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; if (io_should_wake(&iowq, false)) break; - schedule(); + if (uts) { + timeout = schedule_timeout(timeout); + if (timeout == 0) { + ret = -ETIME; + break; + } + } else { + schedule(); + } } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -6973,9 +7243,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!data) return -ENXIO; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node = data->node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (ref_node) percpu_ref_kill(&ref_node->refs); @@ -7098,12 +7368,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) mutex_lock(&sqd->ctx_lock); list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); mutex_unlock(&sqd->ctx_lock); - if (sqd->thread) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + if (sqd->thread) io_sq_thread_unpark(sqd); - } io_put_sq_data(sqd); ctx->sq_data = NULL; @@ -7358,7 +7627,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) data = ref_node->file_data; ctx = data->ctx; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node->done = true; while (!list_empty(&data->ref_list)) { @@ -7370,7 +7639,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) list_del(&ref_node->node); first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); } - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (percpu_ref_is_dying(&data->refs)) delay = 0; @@ -7493,9 +7762,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } file_data->node = ref_node; - spin_lock(&file_data->lock); + spin_lock_bh(&file_data->lock); list_add_tail(&ref_node->node, &file_data->ref_list); - spin_unlock(&file_data->lock); + spin_unlock_bh(&file_data->lock); percpu_ref_get(&file_data->refs); return ret; out_fput: @@ -7652,10 +7921,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - spin_lock(&data->lock); + spin_lock_bh(&data->lock); list_add_tail(&ref_node->node, &data->ref_list); data->node = ref_node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else destroy_fixed_file_ref_node(ref_node); @@ -7783,7 +8052,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_sq_data *sqd; ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; sqd = io_get_sq_data(p); @@ -8369,8 +8638,6 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. 
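
The spin_lock() to spin_lock_bh() conversions on data->lock above matter because io_file_data_ref_zero() can run from softirq context (the percpu_ref switch completes via an RCU callback); process-context users of the same lock must therefore disable bottom halves or risk deadlocking against that callback on the same CPU. The general shape of the rule, sketched:

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(shared_lock);

	/* Process context: block the softirq user of this lock on this CPU. */
	static void process_side(void)
	{
		spin_lock_bh(&shared_lock);
		/* ... manipulate state shared with the RCU callback ... */
		spin_unlock_bh(&shared_lock);
	}

	/* Softirq context (e.g. an RCU callback): already in BH, plain lock suffices. */
	static void softirq_side(void)
	{
		spin_lock(&shared_lock);
		/* ... */
		spin_unlock(&shared_lock);
	}
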
*/ do { - if (ctx->rings) - io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); @@ -8380,17 +8647,17 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true, NULL, NULL); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL); - io_poll_remove_all(ctx, NULL); + io_kill_timeouts(ctx, NULL, NULL); + io_poll_remove_all(ctx, NULL, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); /* if we failed setting up the ctx, we might not have any rings */ - if (ctx->rings) - io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); @@ -8421,120 +8688,31 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -static bool io_wq_files_match(struct io_wq_work *work, void *data) -{ - struct files_struct *files = data; - - return !files || ((work->flags & IO_WQ_WORK_FILES) && - work->identity->files == files); -} - -/* - * Returns true if 'preq' is the link parent of 'req' - */ -static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) -{ - struct io_kiocb *link; - - if (!(preq->flags & REQ_F_LINK_HEAD)) - return false; - - list_for_each_entry(link, &preq->link_list, link_list) { - if (link == req) - return true; - } - - return false; -} - -/* - * We're looking to cancel 'req' because it's holding on to our files, but - * 'req' could be a link to another request. See if it is, and cancel that - * parent request if so. - */ -static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) -{ - struct hlist_node *tmp; - struct io_kiocb *preq; - bool found = false; - int i; - - spin_lock_irq(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { - found = io_match_link(preq, req); - if (found) { - io_poll_remove_one(preq); - break; - } - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} - -static bool io_timeout_remove_link(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_kiocb *preq; - bool found = false; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { - found = io_match_link(preq, req); - if (found) { - __io_timeout_cancel(preq); - break; - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} +struct io_task_cancel { + struct task_struct *task; + struct files_struct *files; +}; -static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; bool ret; - if (req->flags & REQ_F_LINK_TIMEOUT) { + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ spin_lock_irqsave(&ctx->completion_lock, flags); - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); spin_unlock_irqrestore(&ctx->completion_lock, flags); } else { - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); } return ret; } -static void io_attempt_cancel(struct 
io_ring_ctx *ctx, struct io_kiocb *req) -{ - enum io_wq_cancel cret; - - /* cancel this particular work, if it's running */ - cret = io_wq_cancel_work(ctx->io_wq, &req->work); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* find links that hold this pending, cancel those */ - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* if we have a poll link holding this pending, cancel that */ - if (io_poll_remove_link(ctx, req)) - return; - - /* final option, timeout link is holding this req pending */ - io_timeout_remove_link(ctx, req); -} - static void io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) @@ -8544,8 +8722,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_task_match(de->req, task) && - io_match_files(de->req, files)) { + if (io_match_task(de->req, task, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8562,72 +8739,52 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } -/* - * Returns true if we found and killed one or more files pinning requests - */ -static bool io_uring_cancel_files(struct io_ring_ctx *ctx, +static void io_uring_cancel_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { - if (list_empty_careful(&ctx->inflight_list)) - return false; - - /* cancel all at once, should be faster than doing it one by one*/ - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); - while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL, *req; + struct io_task_cancel cancel = { .task = task, .files = files }; + struct io_kiocb *req; DEFINE_WAIT(wait); + bool found = false; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (files && (req->work.flags & IO_WQ_WORK_FILES) && + if (req->task != task || req->work.identity->files != files) continue; - /* req is being completed, ignore */ - if (!refcount_inc_not_zero(&req->refs)) - continue; - cancel_req = req; + found = true; break; } - if (cancel_req) - prepare_to_wait(&ctx->inflight_wait, &wait, - TASK_UNINTERRUPTIBLE); + if (found) + prepare_to_wait(&task->io_uring->wait, &wait, + TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); /* We need to keep going until we don't find a matching req */ - if (!cancel_req) + if (!found) break; - /* cancel this request, or head link requests */ - io_attempt_cancel(ctx, cancel_req); - io_put_req(cancel_req); + + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); + io_poll_remove_all(ctx, task, files); + io_kill_timeouts(ctx, task, files); /* cancellations _may_ trigger task work */ io_run_task_work(); schedule(); - finish_wait(&ctx->inflight_wait, &wait); + finish_wait(&task->io_uring->wait, &wait); } - - return true; } -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task) { - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct task_struct *task = data; - - return io_task_match(req, task); -} - -static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) -{ - bool ret; - - ret = io_uring_cancel_files(ctx, files); - if (!files) { + while (1) { + struct io_task_cancel cancel = 
{ .task = task, .files = NULL, }; enum io_wq_cancel cret; + bool ret = false; - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); if (cret != IO_WQ_CANCEL_NOTFOUND) ret = true; @@ -8639,11 +8796,13 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } } - ret |= io_poll_remove_all(ctx, task); - ret |= io_kill_timeouts(ctx, task); + ret |= io_poll_remove_all(ctx, task, NULL); + ret |= io_kill_timeouts(ctx, task, NULL); + if (!ret) + break; + io_run_task_work(); + cond_resched(); } - - return ret; } /* @@ -8662,17 +8821,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_sq_thread_park(ctx->sq_data); } - if (files) - io_cancel_defer_files(ctx, NULL, files); - else - io_cancel_defer_files(ctx, task, NULL); - + io_cancel_defer_files(ctx, task, files); + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); io_cqring_overflow_flush(ctx, true, task, files); + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); - while (__io_uring_cancel_task_requests(ctx, task, files)) { - io_run_task_work(); - cond_resched(); - } + if (!files) + __io_uring_cancel_task_requests(ctx, task); + else + io_uring_cancel_files(ctx, task, files); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); @@ -8930,9 +9087,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) finish_wait(&ctx->sqo_sq_wait, &wait); } +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, + struct __kernel_timespec __user **ts, + const sigset_t __user **sig) +{ + struct io_uring_getevents_arg arg; + + /* + * If EXT_ARG isn't set, then we have no timespec and the argp pointer + * is just a pointer to the sigset_t. + */ + if (!(flags & IORING_ENTER_EXT_ARG)) { + *sig = (const sigset_t __user *) argp; + *ts = NULL; + return 0; + } + + /* + * EXT_ARG is set - ensure we agree on the size of it and copy in our + * timespec and sigset_t pointers if good. 
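From userspace, the two encodings of io_uring_enter()'s sixth argument look like this (a sketch: ring_fd is an already-initialized ring, and struct io_uring_getevents_arg is the uapi struct of sigmask/sigmask_sz/pad/ts fields that the decoder below copies in):

	/* sketch: wait for one CQE, capped at 10ms, via the extended form;
	 * assumes <unistd.h>, <signal.h>, <sys/syscall.h>, <linux/io_uring.h> */
	struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };
	struct io_uring_getevents_arg arg = {
		.sigmask	= 0,			/* no signal-mask swap */
		.sigmask_sz	= _NSIG / 8,
		.ts		= (__u64)(uintptr_t)&ts,
	};
	int ret = syscall(__NR_io_uring_enter, ring_fd, 0, 1,
			  IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
			  &arg, sizeof(arg));	/* argsz must equal sizeof(arg) */

Without IORING_ENTER_EXT_ARG the same pointer slot carries a bare sigset_t, which is what the first branch above preserves for existing binaries.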
+ */ + if (*argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + *sig = u64_to_user_ptr(arg.sigmask); + *argsz = arg.sigmask_sz; + *ts = u64_to_user_ptr(arg.ts); + return 0; +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const sigset_t __user *, sig, - size_t, sigsz) + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) { struct io_ring_ctx *ctx; long ret = -EBADF; @@ -8942,7 +9129,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT)) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) return -EINVAL; f = fdget(fd); @@ -8969,8 +9156,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (!list_empty_careful(&ctx->cq_overflow_list)) io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) @@ -8988,6 +9177,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (unlikely(ret)) + goto out; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -9000,7 +9196,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, min_complete); } else { - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } @@ -9368,7 +9564,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS; + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | + IORING_FEAT_EXT_ARG; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10cc7979ce38..16a1e82e3aeb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page) return !TestSetPageDirty(page); /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 7dfcab2a2da6..94b7c1cb5ceb 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -668,7 +668,7 @@ unlock: * this does not succeed, we finally try to allocate anywhere * within the aggregate. * - * we also try to allocate anywhere within the aggregate for + * we also try to allocate anywhere within the aggregate * for allocation requests larger than the allocation group * size or requests that specify no hint value. 
* @@ -2549,15 +2549,19 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) */ if (oldval == NOFREE) { rc = dbBackSplit((dmtree_t *) dcp, leafno); - if (rc) + if (rc) { + release_metapage(mp); return rc; + } oldval = dcp->stree[ti]; } dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); } else { rc = dbJoin((dmtree_t *) dcp, leafno, newval); - if (rc) + if (rc) { + release_metapage(mp); return rc; + } } /* check if the root of the current dmap control page changed due diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 29891fad3f09..aa03a904d5ab 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -183,7 +183,7 @@ typedef union dmtree { #define dmt_leafidx t1.leafidx #define dmt_height t1.height #define dmt_budmin t1.budmin -#define dmt_stree t1.stree +#define dmt_stree t2.stree /* * on-disk aggregate disk allocation map descriptor. diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index f65bd6b35412..bb4a342a193d 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -575,7 +575,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * blkno - starting block number of the extents current allocation. * nblks - number of blocks within the extents current allocation. * newnblks - pointer to a s64 value. on entry, this value is the - * the new desired extent size (number of blocks). on + * new desired extent size (number of blocks). on * successful exit, this value is set to the extent's actual * new size (new number of blocks). * newblkno - the starting block number of the extents new allocation. diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index dd635a8a0f8c..1c984214e95e 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -5,7 +5,7 @@ #ifndef _H_JFS_EXTENT #define _H_JFS_EXTENT -/* get block allocation allocation hint as location of disk inode */ +/* get block allocation hint as location of disk inode */ #define INOHINT(ip) \ (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 7fd125c8dd19..805877ce5020 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -132,7 +132,7 @@ struct logpage { * (this comment should be rewritten !) * jfs uses only "after" log records (only a single writer is allowed * in a page, pages are written to temporary paging space if - * if they must be written to disk before commit, and i/o is + * they must be written to disk before commit, and i/o is * scheduled for modified pages to their home location after * the log records containing the after values and the commit * record is written to the log on disk, undo discards the copy diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index c8ce7f1bc594..dca8edd2378c 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1474,7 +1474,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * For the LOG_NOREDOINOEXT record, we need * to pass the IAG number and inode extent * index (within that IAG) from which the - * the extent being released. These have been + * extent is being released. These have been * passed to us in the iplist[1] and iplist[2]. 
*/ lrd->log.noredoinoext.iagnum = diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 16ad920f6fb1..3148e9b35f3b 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3684,7 +3684,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) * * function: * Perform truncate to zero length for deleted file, leaving the - * the xtree and working map untouched. This allows the file to + * xtree and working map untouched. This allows the file to * be accessed via open file handles, while the delete of the file * is committed to disk. * diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 9aec80b9d7c6..7a53eed69fef 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -604,8 +604,7 @@ const struct dentry_operations kernfs_dops = { */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { - if (dentry->d_sb->s_op == &kernfs_sops && - !d_really_is_negative(dentry)) + if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } @@ -1599,7 +1598,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, return error; } -/* Relationship between s_mode and the DT_xxx types */ +/* Relationship between mode and the DT_xxx types */ static inline unsigned char dt_type(struct kernfs_node *kn) { return (kn->mode >> 12) & 15; diff --git a/fs/libfs.c b/fs/libfs.c index 7124c2e8df2f..d1c3bade9f30 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1451,4 +1451,74 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return 0; } EXPORT_SYMBOL(generic_ci_d_hash); + +static const struct dentry_operations generic_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +static const struct dentry_operations generic_encrypted_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +/** + * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry + * @dentry: dentry to set ops on + * + * Casefolded directories need d_hash and d_compare set, so that the dentries + * contained in them are handled case-insensitively. Note that these operations + * are needed on the parent directory rather than on the dentries in it, and + * while the casefolding flag can be toggled on and off on an empty directory, + * dentry_operations can't be changed later. As a result, if the filesystem has + * casefolding support enabled at all, we have to give all dentries the + * casefolding operations even if their inode doesn't have the casefolding flag + * currently (and thus the casefolding ops would be no-ops for now). + * + * Encryption works differently in that the only dentry operation it needs is + * d_revalidate, which it only needs on dentries that have the no-key name flag. + * The no-key flag can't be set "later", so we don't have to worry about that. + * + * Finally, to maximize compatibility with overlayfs (which isn't compatible + * with certain dentry operations) and to avoid taking an unnecessary + * performance hit, we use custom dentry_operations for each possible + * combination rather than always installing all operations. 
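A filesystem opting into casefolding or encryption would call the helper, whose body follows, from its ->lookup() before the dentry is spliced in, since d_ops cannot be changed afterwards. A hypothetical caller (myfs_find_inode is a stand-in, not a real API):

	static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
					  unsigned int flags)
	{
		struct inode *inode;

		/* must run before d_splice_alias(); see the comment above */
		generic_set_encrypted_ci_d_ops(dentry);

		inode = myfs_find_inode(dir, &dentry->d_name);	/* stand-in */
		if (IS_ERR(inode))
			return ERR_CAST(inode);
		return d_splice_alias(inode, dentry);
	}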
+ */ +void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +{ +#ifdef CONFIG_FS_ENCRYPTION + bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; +#endif +#ifdef CONFIG_UNICODE + bool needs_ci_ops = dentry->d_sb->s_encoding; +#endif +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) + if (needs_encrypt_ops && needs_ci_ops) { + d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + return; + } +#endif +#ifdef CONFIG_FS_ENCRYPTION + if (needs_encrypt_ops) { + d_set_d_op(dentry, &generic_encrypted_dentry_ops); + return; + } +#endif +#ifdef CONFIG_UNICODE + if (needs_ci_ops) { + d_set_d_op(dentry, &generic_ci_dentry_ops); + return; + } +#endif +} +EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); diff --git a/fs/locks.c b/fs/locks.c index 1f84a03601fe..99ca97e81b7a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -542,7 +542,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, if (l->l_len > 0) { if (l->l_len - 1 > OFFSET_MAX - fl->fl_start) return -EOVERFLOW; - fl->fl_end = fl->fl_start + l->l_len - 1; + fl->fl_end = fl->fl_start + (l->l_len - 1); } else if (l->l_len < 0) { if (fl->fl_start + l->l_len < 0) @@ -750,7 +750,7 @@ static void __locks_wake_up_blocks(struct file_lock *blocker) } /** - * locks_delete_lock - stop waiting for a file lock + * locks_delete_block - stop waiting for a file lock * @waiter: the lock which was waiting * * lockd/nfsd need to disconnect the lock while working on it. @@ -2539,14 +2539,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, */ if (!error && file_lock->fl_type != F_UNLCK && !(file_lock->fl_flags & FL_OFDLCK)) { + struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do. */ - spin_lock(&current->files->file_lock); - f = fcheck(fd); - spin_unlock(&current->files->file_lock); + spin_lock(&files->file_lock); + f = files_lookup_fd_locked(files, fd); + spin_unlock(&files->file_lock); if (f != filp) { file_lock->fl_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); @@ -2670,14 +2671,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, */ if (!error && file_lock->fl_type != F_UNLCK && !(file_lock->fl_flags & FL_OFDLCK)) { + struct files_struct *files = current->files; /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in * close(). rcu_read_lock() wouldn't do.
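The same substitution repeats in fcntl_setlk64() just below. files_lookup_fd_locked() makes the locking contract part of the name; reduced to essentials, the pattern is (sketch, error paths elided):

	struct files_struct *files = current->files;
	struct file *f;

	spin_lock(&files->file_lock);
	f = files_lookup_fd_locked(files, fd);	/* valid only under file_lock */
	spin_unlock(&files->file_lock);
	/* the pointer may be stale once unlocked: compare, don't dereference */
	if (f != filp)
		file_lock->fl_type = F_UNLCK;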
*/ - spin_lock(&current->files->file_lock); - f = fcheck(fd); - spin_unlock(&current->files->file_lock); + spin_lock(&files->file_lock); + f = files_lookup_fd_locked(files, fd); + spin_unlock(&files->file_lock); if (f != filp) { file_lock->fl_type = F_UNLCK; error = do_lock_file_wait(filp, cmd, file_lock); diff --git a/fs/mount.h b/fs/mount.h index c7abb7b394d8..ce6c376e0bc2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -6,7 +6,6 @@ #include <linux/fs_pin.h> struct mnt_namespace { - atomic_t count; struct ns_common ns; struct mount * root; /* @@ -120,7 +119,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - atomic_inc(&ns->count); + refcount_inc(&ns->ns.count); } extern seqlock_t mount_lock; diff --git a/fs/namei.c b/fs/namei.c index d4a6dd772303..03d0e11e4f36 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,8 +4346,8 @@ out: } EXPORT_SYMBOL(vfs_rename); -static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, - const char __user *newname, unsigned int flags) +int do_renameat2(int olddfd, struct filename *from, int newdfd, + struct filename *to, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; @@ -4355,32 +4355,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - struct filename *from; - struct filename *to; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; - int error; + int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) - return -EINVAL; + goto put_both; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) - return -EINVAL; + goto put_both; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: - from = filename_parentat(olddfd, getname(oldname), lookup_flags, - &old_path, &old_last, &old_type); + from = filename_parentat(olddfd, from, lookup_flags, &old_path, + &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); - goto exit; + goto put_new; } - to = filename_parentat(newdfd, getname(newname), lookup_flags, - &new_path, &new_last, &new_type); + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, + &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -4473,34 +4471,40 @@ exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); - putname(to); exit1: path_put(&old_path); - putname(from); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } -exit: +put_both: + if (!IS_ERR(from)) + putname(from); +put_new: + if (!IS_ERR(to)) + putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { - return do_renameat2(olddfd, oldname, newdfd, newname, flags); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return do_renameat2(olddfd, oldname, newdfd, newname, 0); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, + getname(newname), 0); } int readlink_copy(char
__user *buffer, int buflen, const char *link) diff --git a/fs/namespace.c b/fs/namespace.c index cebaa3e81794..2b681f65ca04 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3274,7 +3274,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->ns.count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); spin_lock_init(&new_ns->ns_lock); @@ -3848,7 +3848,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!atomic_dec_and_test(&ns->count)) + if (!refcount_dec_and_test(&ns->ns.count)) return; drop_collected_mounts(&ns->root->mnt); free_mnt_ns(ns); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 08108b6d2fa1..3be6836074ae 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -697,7 +697,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index dec5880ac6de..acb1d22907da 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -510,7 +510,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e61dbc9b86ae..f7786e00a6a7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -6,10 +6,15 @@ * * NFSv4 callback procedures */ + +#include <linux/errno.h> +#include <linux/math.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/slab.h> #include <linux/rcupdate.h> +#include <linux/types.h> + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4e011adaf967..8a24fe20dccf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -576,7 +576,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en goto out_nopages; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); do { if (entry->label) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 3430d6891e89..7412bb164fa7 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,4 +171,7 @@ const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| + EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| + EXPORT_OP_NOATOMIC_ATTR, }; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7f5aa0403e16..d158a500c25c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -666,7 +666,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + 
xdr_set_scratch_page(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index d913e818858f..86c3f7e69ec4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -82,7 +82,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 24bf5797f88a..4252ce633533 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -378,7 +378,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 3eda40a320a5..c9b61b818ec1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -69,7 +69,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + xdr_set_scratch_page(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8432bd6b95f0..ea7dd8cbfac9 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1539,7 +1539,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE); + xdr_set_scratch_page(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c6dbfcae7517..2eabe5add344 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6403,10 +6403,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; - if (res->acl_scratch != NULL) { - void *p = page_address(res->acl_scratch); - xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); - } + if (res->acl_scratch != NULL) + xdr_set_scratch_page(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index b73d9dd37f73..26f2a50eceac 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -69,10 +69,14 @@ __state_in_grace(struct net *net, bool open) if (!open) return !list_empty(grace_list); + spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { - if (lm->block_opens) + if (lm->block_opens) { + spin_unlock(&grace_lock); return true; + } } + spin_unlock(&grace_lock); return false; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 21e404e7cb68..81e7bb12aca6 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -408,6 +408,12 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } + if (inode->i_sb->s_export_op->flags & 
EXPORT_OP_NOSUBTREECHK && + !(*flags & NFSEXP_NOSUBTREECHECK)) { + dprintk("%s: %s does not support subtree checking!\n", + __func__, inode->i_sb->s_type->name); + return -EINVAL; + } return 0; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3c6c2f7d1688..53fcbf79bdca 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -600,7 +600,7 @@ static struct notifier_block nfsd_file_lease_notifier = { static int nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { trace_nfsd_file_fsnotify_handle_event(inode, mask); @@ -685,6 +685,7 @@ nfsd_file_cache_init(void) if (IS_ERR(nfsd_file_fsnotify_group)) { pr_err("nfsd: unable to create fsnotify group: %ld\n", PTR_ERR(nfsd_file_fsnotify_group)); + ret = PTR_ERR(nfsd_file_fsnotify_group); nfsd_file_fsnotify_group = NULL; goto out_notifier; } diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 6a900f770dd2..b0f66604532a 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -185,10 +185,6 @@ out: /* * XDR decode functions */ -static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) -{ - return 1; -} static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { @@ -255,15 +251,6 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) * XDR encode functions */ -/* - * There must be an encoding function for void results so svc_process - * will work properly. - */ -static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - /* GETACL */ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) { @@ -378,10 +365,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_acl_procedures2[5] = { [ACLPROC2_NULL] = { .pc_func = nfsacld_proc_null, - .pc_decode = nfsaclsvc_decode_voidarg, - .pc_encode = nfsaclsvc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, }, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 34a394e50e1d..7c30876a31a1 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -245,10 +245,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_acl_procedures3[3] = { [ACLPROC3_NULL] = { .pc_func = nfsd3_proc_null, - .pc_decode = nfs3svc_decode_voidarg, - .pc_encode = nfs3svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, }, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index a633044b0dc1..76931f4f57c3 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -689,12 +689,9 @@ out: #define nfsd3_mkdirargs nfsd3_createargs #define nfsd3_readdirplusargs nfsd3_readdirargs #define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_fhandleres nfsd3_attrstat #define nfsd3_attrstatres nfsd3_attrstat #define nfsd3_wccstatres nfsd3_attrstat #define nfsd3_createres nfsd3_diropres -#define nfsd3_voidres nfsd3_voidargs -struct nfsd3_voidargs { int dummy; }; #define ST 1 /* status*/ 
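The four per-version void codecs removed above were byte-for-byte identical, which is what lets every NULL procedure share a single pair. The shared helpers presumably mirror the deleted bodies (a sketch inferred from the removals, not quoted from fs/nfsd/nfssvc.c):

	int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
	{
		return 1;			/* nothing to decode */
	}

	int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
	{
		/* even a void result must pass the reply-length check */
		return xdr_ressize_check(rqstp, p);
	}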
#define FH 17 /* filehandle with length */ @@ -705,10 +702,10 @@ struct nfsd3_voidargs { int dummy; }; static const struct svc_procedure nfsd_procedures3[22] = { [NFS3PROC_NULL] = { .pc_func = nfsd3_proc_null, - .pc_decode = nfs3svc_decode_voidarg, - .pc_encode = nfs3svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd3_voidargs), - .pc_ressize = sizeof(struct nfsd3_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, }, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2277f83da250..821db21ba072 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -206,7 +206,7 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && d_really_is_positive(dentry)) { + if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat; @@ -259,11 +259,11 @@ void fill_pre_wcc(struct svc_fh *fhp) { struct inode *inode; struct kstat stat; + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); __be32 err; - if (fhp->fh_pre_saved) + if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; - inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) { @@ -272,11 +272,12 @@ void fill_pre_wcc(struct svc_fh *fhp) stat.ctime = inode->i_ctime; stat.size = inode->i_size; } + if (v4) + fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); fhp->fh_pre_mtime = stat.mtime; fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; - fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); fhp->fh_pre_saved = true; } @@ -285,30 +286,30 @@ void fill_pre_wcc(struct svc_fh *fhp) */ void fill_post_wcc(struct svc_fh *fhp) { + bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); + struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; + if (fhp->fh_no_wcc) + return; + if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, - d_inode(fhp->fh_dentry)); if (err) { fhp->fh_post_saved = false; - /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; + fhp->fh_post_attr.ctime = inode->i_ctime; } else fhp->fh_post_saved = true; + if (v4) + fhp->fh_post_change = + nfsd4_change_attribute(&fhp->fh_post_attr, inode); } /* * XDR decode functions */ -int -nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) -{ - return 1; -} int nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) @@ -642,12 +643,6 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) * XDR encode functions */ -int -nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} - /* GETATTR */ int nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p) @@ -707,6 +702,7 @@ int nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readlinkres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); @@ -720,6 +716,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + return 0; return 1; } else return xdr_ressize_check(rqstp, p); @@ -730,6 +728,7 @@ int 
nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; p = encode_post_op_attr(rqstp, p, &resp->fh); @@ -746,6 +745,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); } + if (svc_encode_result_payload(rqstp, head->iov_len, + resp->count)) + return 0; return 1; } else return xdr_ressize_check(rqstp, p); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e83b21778816..4727b7f03c5b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -257,8 +257,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * in NFSv4 as in v3 except EXCLUSIVE4_1. */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname.data, - open->op_fname.len, &open->op_iattr, + status = do_nfsd_create(rqstp, current_fh, open->op_fname, + open->op_fnamelen, &open->op_iattr, *resfh, open->op_createmode, (u32 *)open->op_verf.data, &open->op_truncate, &open->op_created); @@ -283,7 +283,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * a chance to an acquire a delegation if appropriate. */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname.data, open->op_fname.len, *resfh); + open->op_fname, open->op_fnamelen, *resfh); if (status) goto out; status = nfsd_check_obj_isreg(*resfh); @@ -360,7 +360,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, bool reclaim = false; dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", - (int)open->op_fname.len, open->op_fname.data, + (int)open->op_fnamelen, open->op_fname, open->op_openowner); /* This check required by spec. 
*/ @@ -1023,8 +1023,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; - nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, - &write->wr_head, write->wr_buflen); + nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages, + write->wr_payload.head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, @@ -1425,7 +1425,7 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) return status; } -static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) +static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) { dst->cp_src_pos = src->cp_src_pos; dst->cp_dst_pos = src->cp_dst_pos; @@ -1444,8 +1444,6 @@ static int dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); dst->ss_mnt = src->ss_mnt; - - return 0; } static void cleanup_async_copy(struct nfsd4_copy *copy) @@ -1539,9 +1537,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, refcount_set(&async_copy->refcount, 1); memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid, sizeof(copy->cp_stateid)); - status = dup_copy_fields(copy, async_copy); - if (status) - goto out_err; + dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) @@ -2276,7 +2272,7 @@ static void svcxdr_init_encode(struct svc_rqst *rqstp, xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack; /* Tail and page_len should be zero at this point: */ buf->len = buf->head[0].iov_len; - xdr->scratch.iov_len = 0; + xdr_reset_scratch_buffer(xdr); xdr->page_ptr = buf->pages - 1; buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages) - rqstp->rq_auth_slack; @@ -3282,7 +3278,7 @@ int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) void warn_on_nonidempotent_op(struct nfsd4_op *op) { if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) { - pr_err("unable to encode reply to nonidempotent op %d (%s)\n", + pr_err("unable to encode reply to nonidempotent op %u (%s)\n", op->opnum, nfsd4_op_name(op->opnum)); WARN_ON_ONCE(1); } @@ -3295,16 +3291,13 @@ static const char *nfsd4_op_name(unsigned opnum) return "unknown_operation"; } -#define nfsd4_voidres nfsd4_voidargs -struct nfsd4_voidargs { int dummy; }; - static const struct svc_procedure nfsd_procedures4[2] = { [NFSPROC4_NULL] = { .pc_func = nfsd4_proc_null, - .pc_decode = nfs4svc_decode_voidarg, - .pc_encode = nfs4svc_encode_voidres, - .pc_argsize = sizeof(struct nfsd4_voidargs), - .pc_ressize = sizeof(struct nfsd4_voidres), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, }, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d7f27ed6b794..1d2cd6a88f61 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -769,6 +769,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, spin_lock(&nn->s2s_cp_lock); new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT); stid->stid.si_opaque.so_id = new_id; + stid->stid.si_generation = 1; spin_unlock(&nn->s2s_cp_lock); idr_preload_end(); if (new_id < 0) @@ -3066,7 +3067,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct
nfsd4_compound_state *cstate, rpc_ntop(sa, addr_str, sizeof(addr_str)); dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " - "ip_addr=%s flags %x, spa_how %d\n", + "ip_addr=%s flags %x, spa_how %u\n", __func__, rqstp, exid, exid->clname.len, exid->clname.data, addr_str, exid->flags, exid->spa_how); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 833a2c64dfe8..45ee6b12ce5b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -54,6 +54,8 @@ #include "pnfs.h" #include "filecache.h" +#include "trace.h" + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> #endif @@ -90,6 +92,8 @@ check_filename(char *str, int len) if (len == 0) return nfserr_inval; + if (len > NFS4_MAXNAMLEN) + return nfserr_nametoolong; if (isdotent(str, len)) return nfserr_badname; for (i = 0; i < len; i++) @@ -98,122 +102,6 @@ check_filename(char *str, int len) return 0; } -#define DECODE_HEAD \ - __be32 *p; \ - __be32 status -#define DECODE_TAIL \ - status = 0; \ -out: \ - return status; \ -xdr_error: \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - status = nfserr_bad_xdr; \ - goto out - -#define READMEM(x,nbytes) do { \ - x = (char *)p; \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define SAVEMEM(x,nbytes) do { \ - if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ - savemem(argp, p, nbytes) : \ - (char *)p)) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ - p += XDR_QUADLEN(nbytes); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */ -#define READ_BUF(nbytes) do { \ - if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \ - p = argp->p; \ - argp->p += XDR_QUADLEN(nbytes); \ - } else if (!(p = read_buf(argp, nbytes))) { \ - dprintk("NFSD: xdr error (%s:%d)\n", \ - __FILE__, __LINE__); \ - goto xdr_error; \ - } \ -} while (0) - -static void next_decode_page(struct nfsd4_compoundargs *argp) -{ - argp->p = page_address(argp->pagelist[0]); - argp->pagelist++; - if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + XDR_QUADLEN(argp->pagelen); - argp->pagelen = 0; - } else { - argp->end = argp->p + (PAGE_SIZE>>2); - argp->pagelen -= PAGE_SIZE; - } -} - -static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) -{ - /* We want more bytes than seem to be available. - * Maybe we need a new page, maybe we have just run out - */ - unsigned int avail = (char *)argp->end - (char *)argp->p; - __be32 *p; - - if (argp->pagelen == 0) { - struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; - - if (!argp->tail) { - argp->tail = true; - avail = vec->iov_len; - argp->p = vec->iov_base; - argp->end = vec->iov_base + avail; - } - - if (avail < nbytes) - return NULL; - - p = argp->p; - argp->p += XDR_QUADLEN(nbytes); - return p; - } - - if (avail + argp->pagelen < nbytes) - return NULL; - if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ - return NULL; - /* ok, we can do it with the current plus the next page */ - if (nbytes <= sizeof(argp->tmp)) - p = argp->tmp; - else { - kfree(argp->tmpp); - p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL); - if (!p) - return NULL; - - } - /* - * The following memcpy is safe because read_buf is always - * called with nbytes > avail, and the two cases above both - * guarantee p points to at least nbytes bytes. 
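The rest of this file's decoders are being rewritten against the xdr_stream primitives; the READ_BUF()/SAVEMEM() machinery being deleted here (its tail continues just below) hand-rolled the page-crossing logic that xdr_inline_decode() now hides. The converted decoders share one shape; a sketch with a hypothetical scalar-plus-opaque pair (not one of the real decoders):

	static __be32
	nfsd4_decode_example(struct nfsd4_compoundargs *argp, u32 *count,
			     char *buf, u32 buflen)
	{
		__be32 *p;

		/* fixed-width scalar: checked decode, no manual bounds math */
		if (xdr_stream_decode_u32(argp->xdr, count) < 0)
			return nfserr_bad_xdr;
		/* opaque body: xdr_inline_decode() fails cleanly at stream end */
		p = xdr_inline_decode(argp->xdr, buflen);
		if (!p)
			return nfserr_bad_xdr;
		memcpy(buf, p, buflen);
		return nfs_ok;
	}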
- */ - memcpy(p, argp->p, avail); - next_decode_page(argp); - memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); - argp->p += XDR_QUADLEN(nbytes - avail); - return p; -} - -static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) -{ - unsigned int this = (char *)argp->end - (char *)argp->p; - - return this + argp->pagelen; -} - static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -259,118 +147,243 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) return p; } +/* + * NFSv4 basic data type decoders + */ + +/* + * This helper handles variable-length opaques which belong to protocol + * elements that this implementation does not support. + */ static __be32 -svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head, - struct page ***pagelist, u32 buflen) +nfsd4_decode_ignored_string(struct nfsd4_compoundargs *argp, u32 maxlen) { - int avail; - int len; - int pages; + u32 len; - /* Sorry .. no magic macros for this.. * - * READ_BUF(write->wr_buflen); - * SAVEMEM(write->wr_buf, write->wr_buflen); - */ - avail = (char *)argp->end - (char *)argp->p; - if (avail + argp->pagelen < buflen) { - dprintk("NFSD: xdr error (%s:%d)\n", - __FILE__, __LINE__); + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (maxlen && len > maxlen) + return nfserr_bad_xdr; + if (!xdr_inline_decode(argp->xdr, len)) return nfserr_bad_xdr; - } - head->iov_base = argp->p; - head->iov_len = avail; - *pagelist = argp->pagelist; - len = XDR_QUADLEN(buflen) << 2; - if (len >= avail) { - len -= avail; + return nfs_ok; +} - pages = len >> PAGE_SHIFT; - argp->pagelist += pages; - argp->pagelen -= pages * PAGE_SIZE; - len -= pages * PAGE_SIZE; +static __be32 +nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) +{ + __be32 *p; + u32 len; - next_decode_page(argp); - } - argp->p += XDR_QUADLEN(len); + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (len == 0 || len > NFS4_OPAQUE_LIMIT) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, len); + if (!p) + return nfserr_bad_xdr; + o->data = svcxdr_tmpalloc(argp, len); + if (!o->data) + return nfserr_jukebox; + o->len = len; + memcpy(o->data, p, len); - return 0; + return nfs_ok; } -/** - * savemem - duplicate a chunk of memory for later processing - * @argp: NFSv4 compound argument structure to be freed with - * @p: pointer to be duplicated - * @nbytes: length to be duplicated - * - * Returns a pointer to a copy of @nbytes bytes of memory at @p - * that are preserved until processing of the NFSv4 compound - * operation described by @argp finishes. 
- */ -static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) +static __be32 +nfsd4_decode_component4(struct nfsd4_compoundargs *argp, char **namp, u32 *lenp) { - void *ret; + __be32 *p, status; - ret = svcxdr_tmpalloc(argp, nbytes); - if (!ret) - return NULL; - memcpy(ret, p, nbytes); - return ret; + if (xdr_stream_decode_u32(argp->xdr, lenp) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, *lenp); + if (!p) + return nfserr_bad_xdr; + status = check_filename((char *)p, *lenp); + if (status) + return status; + *namp = svcxdr_tmpalloc(argp, *lenp); + if (!*namp) + return nfserr_jukebox; + memcpy(*namp, p, *lenp); + + return nfs_ok; } static __be32 -nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv) +nfsd4_decode_nfstime4(struct nfsd4_compoundargs *argp, struct timespec64 *tv) { - DECODE_HEAD; + __be32 *p; - READ_BUF(12); + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 3); + if (!p) + return nfserr_bad_xdr; p = xdr_decode_hyper(p, &tv->tv_sec); tv->tv_nsec = be32_to_cpup(p++); if (tv->tv_nsec >= (u32)1000000000) return nfserr_inval; + return nfs_ok; +} - DECODE_TAIL; +static __be32 +nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_bad_xdr; + memcpy(verf->data, p, sizeof(verf->data)); + return nfs_ok; } +/** + * nfsd4_decode_bitmap4 - Decode an NFSv4 bitmap4 + * @argp: NFSv4 compound argument structure + * @bmval: pointer to an array of u32's to decode into + * @bmlen: size of the @bmval array + * + * The server needs to return nfs_ok rather than nfserr_bad_xdr when + * encountering bitmaps containing bits it does not recognize. This + * includes bits in bitmap words past WORDn, where WORDn is the last + * bitmap WORD the implementation currently supports. Thus we are + * careful here to simply ignore bits in bitmap words that this + * implementation has yet to support explicitly. 
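Concretely, a client built to a newer minor version may send more bitmap words than this server models; the decoder below consumes what was sent and zero-fills the words the client omitted. Typical use (sketch):

	u32 bmval[3];	/* the three words NFSD currently tracks */
	__be32 status;

	status = nfsd4_decode_bitmap4(argp, bmval, ARRAY_SIZE(bmval));
	if (status != nfs_ok)
		return status;	/* bogus word count or truncated stream */
	/* bits the server does not define are simply never tested */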
+ * + * Return values: + * %nfs_ok: @bmval populated successfully + * %nfserr_bad_xdr: the encoded bitmap was invalid + */ static __be32 -nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) +nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) { - u32 bmlen; - DECODE_HEAD; + u32 i, count; + __be32 *p; + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + /* request sanity */ + if (count > 1000) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, count << 2); + if (!p) + return nfserr_bad_xdr; + i = 0; + while (i < count) + bmval[i++] = be32_to_cpup(p++); + while (i < bmlen) + bmval[i++] = 0; + + return nfs_ok; +} + +static __be32 +nfsd4_decode_nfsace4(struct nfsd4_compoundargs *argp, struct nfs4_ace *ace) +{ + __be32 *p, status; + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &ace->type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->flag) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &ace->access_mask) < 0) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + ace->whotype = nfs4_acl_get_whotype((char *)p, length); + if (ace->whotype != NFS4_ACL_WHO_NAMED) + status = nfs_ok; + else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + status = nfsd_map_name_to_gid(argp->rqstp, + (char *)p, length, &ace->who_gid); + else + status = nfsd_map_name_to_uid(argp->rqstp, + (char *)p, length, &ace->who_uid); + + return status; +} + +/* A counted array of nfsace4's */ +static noinline __be32 +nfsd4_decode_acl(struct nfsd4_compoundargs *argp, struct nfs4_acl **acl) +{ + struct nfs4_ace *ace; + __be32 status; + u32 count; + + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + + if (count > xdr_stream_remaining(argp->xdr) / 20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ + return nfserr_fbig; + + *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(count)); + if (*acl == NULL) + return nfserr_jukebox; + + (*acl)->naces = count; + for (ace = (*acl)->aces; ace < (*acl)->aces + count; ace++) { + status = nfsd4_decode_nfsace4(argp, ace); + if (status) + return status; + } + + return nfs_ok; +} - bmval[0] = 0; - bmval[1] = 0; - bmval[2] = 0; +static noinline __be32 +nfsd4_decode_security_label(struct nfsd4_compoundargs *argp, + struct xdr_netobj *label) +{ + u32 lfs, pi, length; + __be32 *p; - READ_BUF(4); - bmlen = be32_to_cpup(p++); - if (bmlen > 1000) - goto xdr_error; + if (xdr_stream_decode_u32(argp->xdr, &lfs) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &pi) < 0) + return nfserr_bad_xdr; - READ_BUF(bmlen << 2); - if (bmlen > 0) - bmval[0] = be32_to_cpup(p++); - if (bmlen > 1) - bmval[1] = be32_to_cpup(p++); - if (bmlen > 2) - bmval[2] = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + if (length > NFS4_MAXLABELLEN) + return nfserr_badlabel; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + label->len = length; + label->data = svcxdr_dupstr(argp, p, length); + if (!label->data) + return nfserr_jukebox; - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, - struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label, int *umask) +nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 
*bmval, u32 bmlen, + struct iattr *iattr, struct nfs4_acl **acl, + struct xdr_netobj *label, int *umask) { - int expected_len, len = 0; - u32 dummy32; - char *buf; + unsigned int starting_pos; + u32 attrlist4_count; + __be32 *p, status; - DECODE_HEAD; iattr->ia_valid = 0; - if ((status = nfsd4_decode_bitmap(argp, bmval))) - return status; + status = nfsd4_decode_bitmap4(argp, bmval, bmlen); + if (status) + return nfserr_bad_xdr; if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 @@ -380,96 +393,69 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_attrnotsupp; } - READ_BUF(4); - expected_len = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &attrlist4_count) < 0) + return nfserr_bad_xdr; + starting_pos = xdr_stream_pos(argp->xdr); if (bmval[0] & FATTR4_WORD0_SIZE) { - READ_BUF(8); - len += 8; - p = xdr_decode_hyper(p, &iattr->ia_size); + u64 size; + + if (xdr_stream_decode_u64(argp->xdr, &size) < 0) + return nfserr_bad_xdr; + iattr->ia_size = size; iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - u32 nace; - struct nfs4_ace *ace; - - READ_BUF(4); len += 4; - nace = be32_to_cpup(p++); - - if (nace > compoundargs_bytes_left(argp)/20) - /* - * Even with 4-byte names there wouldn't be - * space for that many aces; something fishy is - * going on: - */ - return nfserr_fbig; - - *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); - if (*acl == NULL) - return nfserr_jukebox; - - (*acl)->naces = nace; - for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { - READ_BUF(16); len += 16; - ace->type = be32_to_cpup(p++); - ace->flag = be32_to_cpup(p++); - ace->access_mask = be32_to_cpup(p++); - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += XDR_QUADLEN(dummy32) << 2; - READMEM(buf, dummy32); - ace->whotype = nfs4_acl_get_whotype(buf, dummy32); - status = nfs_ok; - if (ace->whotype != NFS4_ACL_WHO_NAMED) - ; - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) - status = nfsd_map_name_to_gid(argp->rqstp, - buf, dummy32, &ace->who_gid); - else - status = nfsd_map_name_to_uid(argp->rqstp, - buf, dummy32, &ace->who_uid); - if (status) - return status; - } + status = nfsd4_decode_acl(argp, acl); + if (status) + return status; } else *acl = NULL; if (bmval[1] & FATTR4_WORD1_MODE) { - READ_BUF(4); - len += 4; - iattr->ia_mode = be32_to_cpup(p++); + u32 mode; + + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode; iattr->ia_mode &= (S_IFMT | S_IALLUGO); iattr->ia_valid |= ATTR_MODE; } if (bmval[1] & FATTR4_WORD1_OWNER) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) + return nfserr_bad_xdr; + status = nfsd_map_name_to_uid(argp->rqstp, (char *)p, length, + &iattr->ia_uid); + if (status) return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + u32 length; + + if (xdr_stream_decode_u32(argp->xdr, &length) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, length); + if (!p) 
+ return nfserr_bad_xdr; + status = nfsd_map_name_to_gid(argp->rqstp, (char *)p, length, + &iattr->ia_gid); + if (status) return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_atime); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_atime); if (status) return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); @@ -478,17 +464,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_ATIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - switch (dummy32) { + u32 set_it; + + if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0) + return nfserr_bad_xdr; + switch (set_it) { case NFS4_SET_TO_CLIENT_TIME: - len += 12; - status = nfsd4_decode_time(argp, &iattr->ia_mtime); + status = nfsd4_decode_nfstime4(argp, &iattr->ia_mtime); if (status) return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); @@ -497,222 +483,329 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_MTIME; break; default: - goto xdr_error; + return nfserr_bad_xdr; } } - label->len = 0; if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) && bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */ - READ_BUF(4); - len += 4; - dummy32 = be32_to_cpup(p++); - READ_BUF(dummy32); - if (dummy32 > NFS4_MAXLABELLEN) - return nfserr_badlabel; - len += (XDR_QUADLEN(dummy32) << 2); - READMEM(buf, dummy32); - label->len = dummy32; - label->data = svcxdr_dupstr(argp, buf, dummy32); - if (!label->data) - return nfserr_jukebox; + status = nfsd4_decode_security_label(argp, label); + if (status) + return status; } if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + u32 mode, mask; + if (!umask) - goto xdr_error; - READ_BUF(8); - len += 8; - dummy32 = be32_to_cpup(p++); - iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); - dummy32 = be32_to_cpup(p++); - *umask = dummy32 & S_IRWXUGO; + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &mode) < 0) + return nfserr_bad_xdr; + iattr->ia_mode = mode & (S_IFMT | S_IALLUGO); + if (xdr_stream_decode_u32(argp->xdr, &mask) < 0) + return nfserr_bad_xdr; + *umask = mask & S_IRWXUGO; iattr->ia_valid |= ATTR_MODE; } - if (len != expected_len) - goto xdr_error; - DECODE_TAIL; + /* request sanity: did attrlist4 contain the expected number of words? 
*/ + if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +nfsd4_decode_stateid4(struct nfsd4_compoundargs *argp, stateid_t *sid) { - DECODE_HEAD; + __be32 *p; - READ_BUF(sizeof(stateid_t)); + p = xdr_inline_decode(argp->xdr, NFS4_STATEID_SIZE); + if (!p) + return nfserr_bad_xdr; sid->si_generation = be32_to_cpup(p++); - COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + memcpy(&sid->si_opaque, p, sizeof(sid->si_opaque)); + return nfs_ok; +} + +static __be32 +nfsd4_decode_clientid4(struct nfsd4_compoundargs *argp, clientid_t *clientid) +{ + __be32 *p; - DECODE_TAIL; + p = xdr_inline_decode(argp->xdr, sizeof(__be64)); + if (!p) + return nfserr_bad_xdr; + memcpy(clientid, p, sizeof(*clientid)); + return nfs_ok; } static __be32 -nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) +nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp, + clientid_t *clientid, struct xdr_netobj *owner) { - DECODE_HEAD; + __be32 status; + + status = nfsd4_decode_clientid4(argp, clientid); + if (status) + return status; + return nfsd4_decode_opaque(argp, owner); +} - READ_BUF(4); - access->ac_req_access = be32_to_cpup(p++); +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp, + struct nfsd4_deviceid *devid) +{ + __be32 *p; - DECODE_TAIL; + p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE); + if (!p) + return nfserr_bad_xdr; + memcpy(devid, p, sizeof(*devid)); + return nfs_ok; } -static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +static __be32 +nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) { - DECODE_HEAD; - struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); - u32 dummy, uid, gid; - char *machine_name; - int i; - int nr_secflavs; + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) + return nfserr_bad_xdr; + if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) + return nfserr_bad_xdr; + if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) + return nfserr_bad_xdr; + + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) + return nfserr_bad_xdr; + if (lcp->lc_up_len > 0) { + lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); + if (!lcp->lc_up_layout) + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_layoutreturn4(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_return_type) < 0) + return nfserr_bad_xdr; + switch (lrp->lr_return_type) { + case RETURN_FILE: + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.length) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lrp->lr_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lrf_body_len) < 0) + return nfserr_bad_xdr; + if (lrp->lrf_body_len > 0) { + lrp->lrf_body = xdr_inline_decode(argp->xdr, lrp->lrf_body_len); + if (!lrp->lrf_body) + return nfserr_bad_xdr; + } + break; + case RETURN_FSID: + case RETURN_ALL: + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_sessionid4(struct nfsd4_compoundargs *argp, + struct 
nfs4_sessionid *sessionid) +{ + __be32 *p; + + p = xdr_inline_decode(argp->xdr, NFS4_MAX_SESSIONID_LEN); + if (!p) + return nfserr_bad_xdr; + memcpy(sessionid->data, p, sizeof(sessionid->data)); + return nfs_ok; +} + +/* Defined in Appendix A of RFC 5531 */ +static __be32 +nfsd4_decode_authsys_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + u32 stamp, gidcount, uid, gid; + __be32 *p, status; + + if (xdr_stream_decode_u32(argp->xdr, &stamp) < 0) + return nfserr_bad_xdr; + /* machine name */ + status = nfsd4_decode_ignored_string(argp, 255); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &uid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gidcount) < 0) + return nfserr_bad_xdr; + if (gidcount > 16) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, gidcount << 2); + if (!p) + return nfserr_bad_xdr; + if (cbs->flavor == (u32)(-1)) { + struct user_namespace *userns = nfsd_user_namespace(argp->rqstp); + + kuid_t kuid = make_kuid(userns, uid); + kgid_t kgid = make_kgid(userns, gid); + if (uid_valid(kuid) && gid_valid(kgid)) { + cbs->uid = kuid; + cbs->gid = kgid; + cbs->flavor = RPC_AUTH_UNIX; + } else { + dprintk("RPC_AUTH_UNIX with invalid uid or gid, ignoring!\n"); + } + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_gss_cb_handles4(struct nfsd4_compoundargs *argp, + struct nfsd4_cb_sec *cbs) +{ + __be32 status; + u32 service; + + dprintk("RPC_AUTH_GSS callback secflavor not supported!\n"); + + if (xdr_stream_decode_u32(argp->xdr, &service) < 0) + return nfserr_bad_xdr; + if (service < RPC_GSS_SVC_NONE || service > RPC_GSS_SVC_PRIVACY) + return nfserr_bad_xdr; + /* gcbp_handle_from_server */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + /* gcbp_handle_from_client */ + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + + return nfs_ok; +} + +/* a counted array of callback_sec_parms4 items */ +static __be32 +nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs) +{ + u32 i, secflavor, nr_secflavs; + __be32 status; /* callback_sec_params4 */ - READ_BUF(4); - nr_secflavs = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &nr_secflavs) < 0) + return nfserr_bad_xdr; if (nr_secflavs) cbs->flavor = (u32)(-1); else /* Is this legal? 
Be generous, take it to mean AUTH_NONE: */ cbs->flavor = 0; + for (i = 0; i < nr_secflavs; ++i) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - switch (dummy) { + if (xdr_stream_decode_u32(argp->xdr, &secflavor) < 0) + return nfserr_bad_xdr; + switch (secflavor) { case RPC_AUTH_NULL: - /* Nothing to read */ + /* void */ if (cbs->flavor == (u32)(-1)) cbs->flavor = RPC_AUTH_NULL; break; case RPC_AUTH_UNIX: - READ_BUF(8); - /* stamp */ - dummy = be32_to_cpup(p++); - - /* machine name */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - SAVEMEM(machine_name, dummy); - - /* uid, gid */ - READ_BUF(8); - uid = be32_to_cpup(p++); - gid = be32_to_cpup(p++); - - /* more gids */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - if (cbs->flavor == (u32)(-1)) { - kuid_t kuid = make_kuid(userns, uid); - kgid_t kgid = make_kgid(userns, gid); - if (uid_valid(kuid) && gid_valid(kgid)) { - cbs->uid = kuid; - cbs->gid = kgid; - cbs->flavor = RPC_AUTH_UNIX; - } else { - dprintk("RPC_AUTH_UNIX with invalid" - "uid or gid ignoring!\n"); - } - } + status = nfsd4_decode_authsys_parms(argp, cbs); + if (status) + return status; break; case RPC_AUTH_GSS: - dprintk("RPC_AUTH_GSS callback secflavor " - "not supported!\n"); - READ_BUF(8); - /* gcbp_service */ - dummy = be32_to_cpup(p++); - /* gcbp_handle_from_server */ - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - /* gcbp_handle_from_client */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); + status = nfsd4_decode_gss_cb_handles4(argp, cbs); + if (status) + return status; break; default: - dprintk("Illegal callback secflavor\n"); return nfserr_inval; } } - DECODE_TAIL; -} -static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) -{ - DECODE_HEAD; + return nfs_ok; +} - READ_BUF(4); - bc->bc_cb_program = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); - DECODE_TAIL; -} +/* + * NFSv4 operation argument decoders + */ -static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +static __be32 +nfsd4_decode_access(struct nfsd4_compoundargs *argp, + struct nfsd4_access *access) { - DECODE_HEAD; - - READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); - COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); - bcts->dir = be32_to_cpup(p++); - /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker - * could help us figure out we should be using it. 
*/ - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) { - DECODE_HEAD; - - READ_BUF(4); - close->cl_seqid = be32_to_cpup(p++); - return nfsd4_decode_stateid(argp, &close->cl_stateid); - - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_stateid4(argp, &close->cl_stateid); } static __be32 nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit) { - DECODE_HEAD; - - READ_BUF(12); - p = xdr_decode_hyper(p, &commit->co_offset); - commit->co_count = be32_to_cpup(p++); - - DECODE_TAIL; + if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0) + return nfserr_bad_xdr; + return nfs_ok; } static __be32 nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create) { - DECODE_HEAD; + __be32 *p, status; - READ_BUF(4); - create->cr_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0) + return nfserr_bad_xdr; switch (create->cr_type) { case NF4LNK: - READ_BUF(4); - create->cr_datalen = be32_to_cpup(p++); - READ_BUF(create->cr_datalen); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_datalen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, create->cr_datalen); + if (!p) + return nfserr_bad_xdr; create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen); if (!create->cr_data) return nfserr_jukebox; break; case NF4BLK: case NF4CHR: - READ_BUF(8); - create->cr_specdata1 = be32_to_cpup(p++); - create->cr_specdata2 = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata1) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata2) < 0) + return nfserr_bad_xdr; break; case NF4SOCK: case NF4FIFO: @@ -720,151 +813,210 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create default: break; } - - READ_BUF(4); - create->cr_namelen = be32_to_cpup(p++); - READ_BUF(create->cr_namelen); - SAVEMEM(create->cr_name, create->cr_namelen); - if ((status = check_filename(create->cr_name, create->cr_namelen))) + status = nfsd4_decode_component4(argp, &create->cr_name, + &create->cr_namelen); + if (status) return status; - - status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label, - &create->cr_umask); + status = nfsd4_decode_fattr4(argp, create->cr_bmval, + ARRAY_SIZE(create->cr_bmval), + &create->cr_iattr, &create->cr_acl, + &create->cr_label, &create->cr_umask); if (status) - goto out; + return status; - DECODE_TAIL; + return nfs_ok; } static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - return nfsd4_decode_stateid(argp, &dr->dr_stateid); + return nfsd4_decode_stateid4(argp, &dr->dr_stateid); } static inline __be32 nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr) { - return nfsd4_decode_bitmap(argp, getattr->ga_bmval); + return nfsd4_decode_bitmap4(argp, getattr->ga_bmval, + ARRAY_SIZE(getattr->ga_bmval)); } static __be32 nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link) { - DECODE_HEAD; + return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen); +} - READ_BUF(4); - link->li_namelen = be32_to_cpup(p++); - 
READ_BUF(link->li_namelen); - SAVEMEM(link->li_name, link->li_namelen); - if ((status = check_filename(link->li_name, link->li_namelen))) +static __be32 +nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid); + if (status) return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid, + &lock->lk_new_owner); +} - DECODE_TAIL; +static __be32 +nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp, + struct nfsd4_lock *lock) +{ + __be32 status; + + status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) { - DECODE_HEAD; + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0) + return nfserr_bad_xdr; + if (lock->lk_is_new) + return nfsd4_decode_open_to_lock_owner4(argp, lock); + return nfsd4_decode_exist_lock_owner4(argp, lock); +} - /* - * type, reclaim(boolean), offset, length, new_lock_owner(boolean) - */ - READ_BUF(28); - lock->lk_type = be32_to_cpup(p++); +static __be32 +nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) +{ + if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0) + return nfserr_bad_xdr; if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT)) - goto xdr_error; - lock->lk_reclaim = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lock->lk_offset); - p = xdr_decode_hyper(p, &lock->lk_length); - lock->lk_is_new = be32_to_cpup(p++); - - if (lock->lk_is_new) { - READ_BUF(4); - lock->lk_new_open_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); - if (status) - return status; - READ_BUF(8 + sizeof(clientid_t)); - lock->lk_new_lock_seqid = be32_to_cpup(p++); - COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); - lock->lk_new_owner.len = be32_to_cpup(p++); - READ_BUF(lock->lk_new_owner.len); - READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); - } else { - status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); - if (status) - return status; - READ_BUF(4); - lock->lk_old_lock_seqid = be32_to_cpup(p++); - } - - DECODE_TAIL; + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lock->lk_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lock->lk_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_locker4(argp, lock); } static __be32 nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) { - DECODE_HEAD; - - READ_BUF(32); - lockt->lt_type = be32_to_cpup(p++); - if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) - goto xdr_error; - p = xdr_decode_hyper(p, &lockt->lt_offset); - p = xdr_decode_hyper(p, &lockt->lt_length); - COPYMEM(&lockt->lt_clientid, 8); - lockt->lt_owner.len = be32_to_cpup(p++); - READ_BUF(lockt->lt_owner.len); - READMEM(lockt->lt_owner.data, lockt->lt_owner.len); - - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, 
&lockt->lt_type) < 0) + return nfserr_bad_xdr; + if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT)) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_length) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_state_owner4(argp, &lockt->lt_clientid, + &lockt->lt_owner); } static __be32 nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) { - DECODE_HEAD; + __be32 status; - READ_BUF(8); - locku->lu_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0) + return nfserr_bad_xdr; if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) - goto xdr_error; - locku->lu_seqid = be32_to_cpup(p++); - status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &locku->lu_seqid) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &locku->lu_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &locku->lu_offset); - p = xdr_decode_hyper(p, &locku->lu_length); + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &locku->lu_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup) { - DECODE_HEAD; + return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len); +} - READ_BUF(4); - lookup->lo_len = be32_to_cpup(p++); - READ_BUF(lookup->lo_len); - SAVEMEM(lookup->lo_name, lookup->lo_len); - if ((status = check_filename(lookup->lo_name, lookup->lo_len))) - return status; +static __be32 +nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &open->op_createmode) < 0) + return nfserr_bad_xdr; + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: + status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE: + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &open->op_verf); + if (status) + return status; + status = nfsd4_decode_fattr4(argp, open->op_bmval, + ARRAY_SIZE(open->op_bmval), + &open->op_iattr, &open->op_acl, + &open->op_label, &open->op_umask); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; +} + +static __be32 +nfsd4_decode_openflag4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &open->op_create) < 0) + return nfserr_bad_xdr; + switch (open->op_create) { + case NFS4_OPEN_NOCREATE: + break; + case NFS4_OPEN_CREATE: + status = nfsd4_decode_createhow4(argp, open); + if (status) + return status; + break; + default: + return nfserr_bad_xdr; + } + + return nfs_ok; } static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when) { - __be32 *p; u32 w; - READ_BUF(4); - w = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &w) < 0) + return 
nfserr_bad_xdr; *share_access = w & NFS4_SHARE_ACCESS_MASK; *deleg_want = w & NFS4_SHARE_WANT_MASK; if (deleg_when) @@ -907,206 +1059,153 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED): return nfs_ok; } -xdr_error: return nfserr_bad_xdr; } static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x) { - __be32 *p; - - READ_BUF(4); - *x = be32_to_cpup(p++); - /* Note: unlinke access bits, deny bits may be zero. */ - if (*x & ~NFS4_SHARE_DENY_BOTH) + if (xdr_stream_decode_u32(argp->xdr, x) < 0) return nfserr_bad_xdr; - return nfs_ok; -xdr_error: - return nfserr_bad_xdr; -} - -static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o) -{ - __be32 *p; - - READ_BUF(4); - o->len = be32_to_cpup(p++); - - if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT) + /* Note: unlike access bits, deny bits may be zero. */ + if (*x & ~NFS4_SHARE_DENY_BOTH) return nfserr_bad_xdr; - READ_BUF(o->len); - SAVEMEM(o->data, o->len); return nfs_ok; -xdr_error: - return nfserr_bad_xdr; } static __be32 -nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp, + struct nfsd4_open *open) { - DECODE_HEAD; - u32 dummy; - - memset(open->op_bmval, 0, sizeof(open->op_bmval)); - open->op_iattr.ia_valid = 0; - open->op_openowner = NULL; - - open->op_xdr_error = 0; - /* seqid, share_access, share_deny, clientid, ownerlen */ - READ_BUF(4); - open->op_seqid = be32_to_cpup(p++); - /* decode, yet ignore deleg_when until supported */ - status = nfsd4_decode_share_access(argp, &open->op_share_access, - &open->op_deleg_want, &dummy); - if (status) - goto xdr_error; - status = nfsd4_decode_share_deny(argp, &open->op_share_deny); - if (status) - goto xdr_error; - READ_BUF(sizeof(clientid_t)); - COPYMEM(&open->op_clientid, sizeof(clientid_t)); - status = nfsd4_decode_opaque(argp, &open->op_owner); - if (status) - goto xdr_error; - READ_BUF(4); - open->op_create = be32_to_cpup(p++); - switch (open->op_create) { - case NFS4_OPEN_NOCREATE: - break; - case NFS4_OPEN_CREATE: - READ_BUF(4); - open->op_createmode = be32_to_cpup(p++); - switch (open->op_createmode) { - case NFS4_CREATE_UNCHECKED: - case NFS4_CREATE_GUARDED: - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - case NFS4_CREATE_EXCLUSIVE: - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - break; - case NFS4_CREATE_EXCLUSIVE4_1: - if (argp->minorversion < 1) - goto xdr_error; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); - status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label, - &open->op_umask); - if (status) - goto out; - break; - default: - goto xdr_error; - } - break; - default: - goto xdr_error; - } + __be32 status; - /* open_claim */ - READ_BUF(4); - open->op_claim_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open->op_claim_type) < 0) + return nfserr_bad_xdr; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_PREV: - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); 
+ if (status) return status; break; case NFS4_OPEN_CLAIM_PREVIOUS: - READ_BUF(4); - open->op_delegate_type = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open->op_delegate_type) < 0) + return nfserr_bad_xdr; break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); if (status) return status; - READ_BUF(4); - open->op_fname.len = be32_to_cpup(p++); - READ_BUF(open->op_fname.len); - SAVEMEM(open->op_fname.data, open->op_fname.len); - if ((status = check_filename(open->op_fname.data, open->op_fname.len))) + status = nfsd4_decode_component4(argp, &open->op_fname, + &open->op_fnamelen); + if (status) return status; break; case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: if (argp->minorversion < 1) - goto xdr_error; + return nfserr_bad_xdr; /* void */ break; case NFS4_OPEN_CLAIM_DELEG_CUR_FH: if (argp->minorversion < 1) - goto xdr_error; - status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid); if (status) return status; break; default: - goto xdr_error; + return nfserr_bad_xdr; } - DECODE_TAIL; + return nfs_ok; +} + +static __be32 +nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) +{ + __be32 status; + u32 dummy; + + memset(open->op_bmval, 0, sizeof(open->op_bmval)); + open->op_iattr.ia_valid = 0; + open->op_openowner = NULL; + + open->op_xdr_error = 0; + if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored */ + status = nfsd4_decode_share_access(argp, &open->op_share_access, + &open->op_deleg_want, &dummy); + if (status) + return status; + status = nfsd4_decode_share_deny(argp, &open->op_share_deny); + if (status) + return status; + status = nfsd4_decode_state_owner4(argp, &open->op_clientid, + &open->op_owner); + if (status) + return status; + status = nfsd4_decode_openflag4(argp, open); + if (status) + return status; + return nfsd4_decode_open_claim4(argp, open); } static __be32 nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + status = nfsd4_decode_stateid4(argp, &open_conf->oc_req_stateid); if (status) return status; - READ_BUF(4); - open_conf->oc_seqid = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down) { - DECODE_HEAD; - - status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + __be32 status; + + status = nfsd4_decode_stateid4(argp, &open_down->od_stateid); if (status) return status; - READ_BUF(4); - open_down->od_seqid = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &open_down->od_seqid) < 0) + return nfserr_bad_xdr; + /* deleg_want is ignored */ status = nfsd4_decode_share_access(argp, &open_down->od_share_access, &open_down->od_deleg_want, NULL); if (status) return status; - status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny); - if (status) - return status; - DECODE_TAIL; + return nfsd4_decode_share_deny(argp, &open_down->od_share_deny); } static __be32 nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh 
*putfh) { - DECODE_HEAD; + __be32 *p; - READ_BUF(4); - putfh->pf_fhlen = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0) + return nfserr_bad_xdr; if (putfh->pf_fhlen > NFS4_FHSIZE) - goto xdr_error; - READ_BUF(putfh->pf_fhlen); - SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen); + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, putfh->pf_fhlen); + if (!p) + return nfserr_bad_xdr; + putfh->pf_fhval = svcxdr_tmpalloc(argp, putfh->pf_fhlen); + if (!putfh->pf_fhval) + return nfserr_jukebox; + memcpy(putfh->pf_fhval, p, putfh->pf_fhlen); - DECODE_TAIL; + return nfs_ok; } static __be32 @@ -1120,109 +1219,68 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &read->rd_stateid); + status = nfsd4_decode_stateid4(argp, &read->rd_stateid); if (status) return status; - READ_BUF(12); - p = xdr_decode_hyper(p, &read->rd_offset); - read->rd_length = be32_to_cpup(p++); + if (xdr_stream_decode_u64(argp->xdr, &read->rd_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &read->rd_length) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir) { - DECODE_HEAD; + __be32 status; - READ_BUF(24); - p = xdr_decode_hyper(p, &readdir->rd_cookie); - COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data)); - readdir->rd_dircount = be32_to_cpup(p++); - readdir->rd_maxcount = be32_to_cpup(p++); - if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval))) - goto out; + if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_verifier4(argp, &readdir->rd_verf); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_dircount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, readdir->rd_bmval, + ARRAY_SIZE(readdir->rd_bmval)) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove) { - DECODE_HEAD; - - READ_BUF(4); - remove->rm_namelen = be32_to_cpup(p++); - READ_BUF(remove->rm_namelen); - SAVEMEM(remove->rm_name, remove->rm_namelen); - if ((status = check_filename(remove->rm_name, remove->rm_namelen))) - return status; - - DECODE_TAIL; + return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen); } static __be32 nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename) { - DECODE_HEAD; - - READ_BUF(4); - rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen); - SAVEMEM(rename->rn_sname, rename->rn_snamelen); - READ_BUF(4); - rename->rn_tnamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_tnamelen); - SAVEMEM(rename->rn_tname, rename->rn_tnamelen); - if ((status = check_filename(rename->rn_sname, rename->rn_snamelen))) - return status; - if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen))) - return status; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen); + if (status) + return status; + return nfsd4_decode_component4(argp, &rename->rn_tname, &rename->rn_tnamelen); } static __be32 nfsd4_decode_renew(struct nfsd4_compoundargs *argp, 
clientid_t *clientid) { - DECODE_HEAD; - - if (argp->minorversion >= 1) - return nfserr_notsupp; - - READ_BUF(sizeof(clientid_t)); - COPYMEM(clientid, sizeof(clientid_t)); - - DECODE_TAIL; + return nfsd4_decode_clientid4(argp, clientid); } static __be32 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, struct nfsd4_secinfo *secinfo) { - DECODE_HEAD; - - READ_BUF(4); - secinfo->si_namelen = be32_to_cpup(p++); - READ_BUF(secinfo->si_namelen); - SAVEMEM(secinfo->si_name, secinfo->si_namelen); - status = check_filename(secinfo->si_name, secinfo->si_namelen); - if (status) - return status; - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, - struct nfsd4_secinfo_no_name *sin) -{ - DECODE_HEAD; - - READ_BUF(4); - sin->sin_style = be32_to_cpup(p++); - DECODE_TAIL; + return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen); } static __be32 @@ -1230,362 +1288,381 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta { __be32 status; - status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid); if (status) return status; - return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label, NULL); + return nfsd4_decode_fattr4(argp, setattr->sa_bmval, + ARRAY_SIZE(setattr->sa_bmval), + &setattr->sa_iattr, &setattr->sa_acl, + &setattr->sa_label, NULL); } static __be32 nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid) { - DECODE_HEAD; + __be32 *p, status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE); - + status = nfsd4_decode_verifier4(argp, &setclientid->se_verf); + if (status) + return status; status = nfsd4_decode_opaque(argp, &setclientid->se_name); if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_prog) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_netid_len) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_netid_len); + if (!p) return nfserr_bad_xdr; - READ_BUF(8); - setclientid->se_callback_prog = be32_to_cpup(p++); - setclientid->se_callback_netid_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_netid_len); - SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); - READ_BUF(4); - setclientid->se_callback_addr_len = be32_to_cpup(p++); + setclientid->se_callback_netid_val = svcxdr_tmpalloc(argp, + setclientid->se_callback_netid_len); + if (!setclientid->se_callback_netid_val) + return nfserr_jukebox; + memcpy(setclientid->se_callback_netid_val, p, + setclientid->se_callback_netid_len); - READ_BUF(setclientid->se_callback_addr_len); - SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); - READ_BUF(4); - setclientid->se_callback_ident = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_addr_len) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, setclientid->se_callback_addr_len); + if (!p) + return nfserr_bad_xdr; + setclientid->se_callback_addr_val = svcxdr_tmpalloc(argp, + setclientid->se_callback_addr_len); + if (!setclientid->se_callback_addr_val) + return nfserr_jukebox; + memcpy(setclientid->se_callback_addr_val, p, + setclientid->se_callback_addr_len); + if (xdr_stream_decode_u32(argp->xdr, 
&setclientid->se_callback_ident) < 0) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(8 + NFS4_VERIFIER_SIZE); - COPYMEM(&scd_c->sc_clientid, 8); - COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE); - - DECODE_TAIL; + status = nfsd4_decode_clientid4(argp, &scd_c->sc_clientid); + if (status) + return status; + return nfsd4_decode_verifier4(argp, &scd_c->sc_confirm); } /* Also used for NVERIFY */ static __be32 nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) { - DECODE_HEAD; + __be32 *p, status; - if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) - goto out; + status = nfsd4_decode_bitmap4(argp, verify->ve_bmval, + ARRAY_SIZE(verify->ve_bmval)); + if (status) + return status; /* For convenience's sake, we compare raw xdr'd attributes in * nfsd4_proc_verify */ - READ_BUF(4); - verify->ve_attrlen = be32_to_cpup(p++); - READ_BUF(verify->ve_attrlen); - SAVEMEM(verify->ve_attrval, verify->ve_attrlen); + if (xdr_stream_decode_u32(argp->xdr, &verify->ve_attrlen) < 0) + return nfserr_bad_xdr; + p = xdr_inline_decode(argp->xdr, verify->ve_attrlen); + if (!p) + return nfserr_bad_xdr; + verify->ve_attrval = svcxdr_tmpalloc(argp, verify->ve_attrlen); + if (!verify->ve_attrval) + return nfserr_jukebox; + memcpy(verify->ve_attrval, p, verify->ve_attrlen); - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { - DECODE_HEAD; + __be32 status; - status = nfsd4_decode_stateid(argp, &write->wr_stateid); + status = nfsd4_decode_stateid4(argp, &write->wr_stateid); if (status) return status; - READ_BUF(16); - p = xdr_decode_hyper(p, &write->wr_offset); - write->wr_stable_how = be32_to_cpup(p++); + if (xdr_stream_decode_u64(argp->xdr, &write->wr_offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_stable_how) < 0) + return nfserr_bad_xdr; if (write->wr_stable_how > NFS_FILE_SYNC) - goto xdr_error; - write->wr_buflen = be32_to_cpup(p++); - - status = svcxdr_construct_vector(argp, &write->wr_head, - &write->wr_pagelist, write->wr_buflen); - if (status) - return status; + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &write->wr_buflen) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen)) + return nfserr_bad_xdr; - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner) { - DECODE_HEAD; + __be32 status; if (argp->minorversion >= 1) return nfserr_notsupp; - READ_BUF(12); - COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t)); - rlockowner->rl_owner.len = be32_to_cpup(p++); - READ_BUF(rlockowner->rl_owner.len); - READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + status = nfsd4_decode_state_owner4(argp, &rlockowner->rl_clientid, + &rlockowner->rl_owner); + if (status) + return status; if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) return nfserr_inval; - DECODE_TAIL; + + return nfs_ok; +} + +static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc) +{ + if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec); +} + 
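/*
 * Aside (not part of the patch): every decoder converted in this series
 * follows the same xdr_stream calling convention. The helpers from
 * <linux/sunrpc/xdr.h> return a negative value when the receive buffer
 * runs dry, which each decoder maps to nfserr_bad_xdr; sub-decoders
 * already return an nfsstat4, so their status is propagated unchanged.
 * A minimal sketch -- "nfsd4_decode_example" and its argument struct
 * are hypothetical:
 */
static __be32
nfsd4_decode_example(struct nfsd4_compoundargs *argp,
                     struct nfsd4_example *ex)
{
        __be32 status;

        /* fixed-width XDR word: a negative return means a short buffer */
        if (xdr_stream_decode_u32(argp->xdr, &ex->ex_flags) < 0)
                return nfserr_bad_xdr;
        /* compound XDR type: propagate the sub-decoder's status */
        status = nfsd4_decode_stateid4(argp, &ex->ex_stateid);
        if (status)
                return status;
        return nfs_ok;
}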
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) +{ + u32 use_conn_in_rdma_mode; + __be32 status; + + status = nfsd4_decode_sessionid4(argp, &bcts->sessionid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &bcts->dir) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &use_conn_in_rdma_mode) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 -nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, - struct nfsd4_exchange_id *exid) +nfsd4_decode_state_protect_ops(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) { - int dummy, tmp; - DECODE_HEAD; + __be32 status; - READ_BUF(NFS4_VERIFIER_SIZE); - COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + status = nfsd4_decode_bitmap4(argp, exid->spo_must_enforce, + ARRAY_SIZE(exid->spo_must_enforce)); + if (status) + return nfserr_bad_xdr; + status = nfsd4_decode_bitmap4(argp, exid->spo_must_allow, + ARRAY_SIZE(exid->spo_must_allow)); + if (status) + return nfserr_bad_xdr; - status = nfsd4_decode_opaque(argp, &exid->clname); + return nfs_ok; +} + +/* + * This implementation currently does not support SP4_SSV. + * This decoder simply skips over these arguments. + */ +static noinline __be32 +nfsd4_decode_ssv_sp_parms(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + u32 count, window, num_gss_handles; + __be32 status; + + /* ssp_ops */ + status = nfsd4_decode_state_protect_ops(argp, exid); if (status) + return status; + + /* ssp_hash_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } + + /* ssp_encr_algs<> */ + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + while (count--) { + status = nfsd4_decode_ignored_string(argp, 0); + if (status) + return status; + } + + if (xdr_stream_decode_u32(argp->xdr, &window) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &num_gss_handles) < 0) return nfserr_bad_xdr; - READ_BUF(4); - exid->flags = be32_to_cpup(p++); + return nfs_ok; +} - /* Ignore state_protect4_a */ - READ_BUF(4); - exid->spa_how = be32_to_cpup(p++); +static __be32 +nfsd4_decode_state_protect4_a(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + + if (xdr_stream_decode_u32(argp->xdr, &exid->spa_how) < 0) + return nfserr_bad_xdr; switch (exid->spa_how) { case SP4_NONE: break; case SP4_MACH_CRED: - /* spo_must_enforce */ - status = nfsd4_decode_bitmap(argp, - exid->spo_must_enforce); + status = nfsd4_decode_state_protect_ops(argp, exid); if (status) - goto out; - /* spo_must_allow */ - status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); - if (status) - goto out; + return status; break; case SP4_SSV: - /* ssp_ops */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - - /* ssp_hash_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } - - /* ssp_encr_algs<> */ - READ_BUF(4); - tmp = be32_to_cpup(p++); - while (tmp--) { - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy); - p += XDR_QUADLEN(dummy); - } - - /* ignore ssp_window and ssp_num_gss_handles: */ - READ_BUF(8); + status = nfsd4_decode_ssv_sp_parms(argp, exid); + if 
(status) + return status; break; default: - goto xdr_error; + return nfserr_bad_xdr; } - READ_BUF(4); /* nfs_impl_id4 array length */ - dummy = be32_to_cpup(p++); + return nfs_ok; +} - if (dummy > 1) - goto xdr_error; +static __be32 +nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + __be32 status; + u32 count; - if (dummy == 1) { + if (xdr_stream_decode_u32(argp->xdr, &count) < 0) + return nfserr_bad_xdr; + switch (count) { + case 0: + break; + case 1: + /* Note that RFC 8881 places no length limit on + * nii_domain, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ status = nfsd4_decode_opaque(argp, &exid->nii_domain); if (status) - goto xdr_error; - - /* nii_name */ + return status; + /* Note that RFC 8881 places no length limit on + * nii_name, but this implementation permits no + * more than NFS4_OPAQUE_LIMIT bytes */ status = nfsd4_decode_opaque(argp, &exid->nii_name); if (status) - goto xdr_error; - - /* nii_date */ - status = nfsd4_decode_time(argp, &exid->nii_time); + return status; + status = nfsd4_decode_nfstime4(argp, &exid->nii_time); if (status) - goto xdr_error; + return status; + break; + default: + return nfserr_bad_xdr; } - DECODE_TAIL; -} -static __be32 -nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, - struct nfsd4_create_session *sess) -{ - DECODE_HEAD; - - READ_BUF(16); - COPYMEM(&sess->clientid, 8); - sess->seqid = be32_to_cpup(p++); - sess->flags = be32_to_cpup(p++); - - /* Fore channel attrs */ - READ_BUF(28); - p++; /* headerpadsz is always 0 */ - sess->fore_channel.maxreq_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_sz = be32_to_cpup(p++); - sess->fore_channel.maxresp_cached = be32_to_cpup(p++); - sess->fore_channel.maxops = be32_to_cpup(p++); - sess->fore_channel.maxreqs = be32_to_cpup(p++); - sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->fore_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->fore_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->fore_channel.nr_rdma_attrs > 1) { - dprintk("Too many fore channel attr bitmaps!\n"); - goto xdr_error; - } - - /* Back channel attrs */ - READ_BUF(28); - p++; /* headerpadsz is always 0 */ - sess->back_channel.maxreq_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_sz = be32_to_cpup(p++); - sess->back_channel.maxresp_cached = be32_to_cpup(p++); - sess->back_channel.maxops = be32_to_cpup(p++); - sess->back_channel.maxreqs = be32_to_cpup(p++); - sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++); - if (sess->back_channel.nr_rdma_attrs == 1) { - READ_BUF(4); - sess->back_channel.rdma_attrs = be32_to_cpup(p++); - } else if (sess->back_channel.nr_rdma_attrs > 1) { - dprintk("Too many back channel attr bitmaps!\n"); - goto xdr_error; - } - - READ_BUF(4); - sess->callback_prog = be32_to_cpup(p++); - nfsd4_decode_cb_sec(argp, &sess->cb_sec); - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, - struct nfsd4_destroy_session *destroy_session) +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) { - DECODE_HEAD; - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_verifier4(argp, &exid->verifier); + if (status) + return status; + status = nfsd4_decode_opaque(argp, &exid->clname); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &exid->flags) < 0) + return nfserr_bad_xdr; + 
status = nfsd4_decode_state_protect4_a(argp, exid); + if (status) + return status; + return nfsd4_decode_nfs_impl_id4(argp, exid); } static __be32 -nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, - struct nfsd4_free_stateid *free_stateid) +nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp, + struct nfsd4_channel_attrs *ca) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - free_stateid->fr_stateid.si_generation = be32_to_cpup(p++); - COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; -} + __be32 *p; -static __be32 -nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, - struct nfsd4_sequence *seq) -{ - DECODE_HEAD; + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 7); + if (!p) + return nfserr_bad_xdr; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); - COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); - seq->seqid = be32_to_cpup(p++); - seq->slotid = be32_to_cpup(p++); - seq->maxslots = be32_to_cpup(p++); - seq->cachethis = be32_to_cpup(p++); + /* headerpadsz is ignored */ + p++; + ca->maxreq_sz = be32_to_cpup(p++); + ca->maxresp_sz = be32_to_cpup(p++); + ca->maxresp_cached = be32_to_cpup(p++); + ca->maxops = be32_to_cpup(p++); + ca->maxreqs = be32_to_cpup(p++); + ca->nr_rdma_attrs = be32_to_cpup(p); + switch (ca->nr_rdma_attrs) { + case 0: + break; + case 1: + if (xdr_stream_decode_u32(argp->xdr, &ca->rdma_attrs) < 0) + return nfserr_bad_xdr; + break; + default: + return nfserr_bad_xdr; + } - DECODE_TAIL; + return nfs_ok; } static __be32 -nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) { - int i; - __be32 *p, status; - struct nfsd4_test_stateid_id *stateid; - - READ_BUF(4); - test_stateid->ts_num_ids = ntohl(*p++); - - INIT_LIST_HEAD(&test_stateid->ts_stateid_list); - - for (i = 0; i < test_stateid->ts_num_ids; i++) { - stateid = svcxdr_tmpalloc(argp, sizeof(*stateid)); - if (!stateid) { - status = nfserrno(-ENOMEM); - goto out; - } - - INIT_LIST_HEAD(&stateid->ts_id_list); - list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); + __be32 status; - status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid); - if (status) - goto out; - } + status = nfsd4_decode_clientid4(argp, &sess->clientid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->seqid) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &sess->flags) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_channel_attrs4(argp, &sess->fore_channel); + if (status) + return status; + status = nfsd4_decode_channel_attrs4(argp, &sess->back_channel); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_cb_sec(argp, &sess->cb_sec); + if (status) + return status; - status = 0; -out: - return status; -xdr_error: - dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__); - status = nfserr_bad_xdr; - goto out; + return nfs_ok; } -static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc) +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) { - DECODE_HEAD; - - READ_BUF(8); - COPYMEM(&dc->clientid, 8); - - DECODE_TAIL; + return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid); } -static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs 
*argp, struct nfsd4_reclaim_complete *rc) +static __be32 +nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp, + struct nfsd4_free_stateid *free_stateid) { - DECODE_HEAD; - - READ_BUF(4); - rc->rca_one_fs = be32_to_cpup(p++); - - DECODE_TAIL; + return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid); } #ifdef CONFIG_NFSD_PNFS @@ -1593,244 +1670,264 @@ static __be32 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, struct nfsd4_getdeviceinfo *gdev) { - DECODE_HEAD; - u32 num, i; - - READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); - COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); - gdev->gd_layout_type = be32_to_cpup(p++); - gdev->gd_maxcount = be32_to_cpup(p++); - num = be32_to_cpup(p++); - if (num) { - if (num > 1000) - goto xdr_error; - READ_BUF(4 * num); - gdev->gd_notify_types = be32_to_cpup(p++); - for (i = 1; i < num; i++) { - if (be32_to_cpup(p++)) { - status = nfserr_inval; - goto out; - } - } - } - DECODE_TAIL; -} - -static __be32 -nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutget *lgp) -{ - DECODE_HEAD; - - READ_BUF(36); - lgp->lg_signal = be32_to_cpup(p++); - lgp->lg_layout_type = be32_to_cpup(p++); - lgp->lg_seg.iomode = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &lgp->lg_seg.offset); - p = xdr_decode_hyper(p, &lgp->lg_seg.length); - p = xdr_decode_hyper(p, &lgp->lg_minlength); + __be32 status; - status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); if (status) return status; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_maxcount) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_uint32_array(argp->xdr, + &gdev->gd_notify_types, 1) < 0) + return nfserr_bad_xdr; - READ_BUF(4); - lgp->lg_maxcount = be32_to_cpup(p++); - - DECODE_TAIL; + return nfs_ok; } static __be32 nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, - struct nfsd4_layoutcommit *lcp) + struct nfsd4_layoutcommit *lcp) { - DECODE_HEAD; - u32 timechange; - - READ_BUF(20); - p = xdr_decode_hyper(p, &lcp->lc_seg.offset); - p = xdr_decode_hyper(p, &lcp->lc_seg.length); - lcp->lc_reclaim = be32_to_cpup(p++); + __be32 *p, status; - status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_reclaim) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lcp->lc_sid); if (status) return status; - - READ_BUF(4); - lcp->lc_newoffset = be32_to_cpup(p++); + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0) + return nfserr_bad_xdr; if (lcp->lc_newoffset) { - READ_BUF(8); - p = xdr_decode_hyper(p, &lcp->lc_last_wr); + if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0) + return nfserr_bad_xdr; } else lcp->lc_last_wr = 0; - READ_BUF(4); - timechange = be32_to_cpup(p++); - if (timechange) { - status = nfsd4_decode_time(argp, &lcp->lc_mtime); + p = xdr_inline_decode(argp->xdr, XDR_UNIT); + if (!p) + return nfserr_bad_xdr; + if (xdr_item_is_present(p)) { + status = nfsd4_decode_nfstime4(argp, &lcp->lc_mtime); if (status) return status; } else { lcp->lc_mtime.tv_nsec = UTIME_NOW; } - READ_BUF(8); - lcp->lc_layout_type = be32_to_cpup(p++); + return nfsd4_decode_layoutupdate4(argp, lcp); +} - /* - * Save the layout update in XDR format and let the layout driver 
deal - * with it later. - */ - lcp->lc_up_len = be32_to_cpup(p++); - if (lcp->lc_up_len > 0) { - READ_BUF(lcp->lc_up_len); - READMEM(lcp->lc_up_layout, lcp->lc_up_len); - } +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + __be32 status; - DECODE_TAIL; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_seg.iomode) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.offset) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.length) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_minlength) < 0) + return nfserr_bad_xdr; + status = nfsd4_decode_stateid4(argp, &lgp->lg_sid); + if (status) + return status; + if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_maxcount) < 0) + return nfserr_bad_xdr; + + return nfs_ok; } static __be32 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, struct nfsd4_layoutreturn *lrp) { - DECODE_HEAD; + if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0) + return nfserr_bad_xdr; + if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_seg.iomode) < 0) + return nfserr_bad_xdr; + return nfsd4_decode_layoutreturn4(argp, lrp); +} +#endif /* CONFIG_NFSD_PNFS */ - READ_BUF(16); - lrp->lr_reclaim = be32_to_cpup(p++); - lrp->lr_layout_type = be32_to_cpup(p++); - lrp->lr_seg.iomode = be32_to_cpup(p++); - lrp->lr_return_type = be32_to_cpup(p++); - if (lrp->lr_return_type == RETURN_FILE) { - READ_BUF(16); - p = xdr_decode_hyper(p, &lrp->lr_seg.offset); - p = xdr_decode_hyper(p, &lrp->lr_seg.length); +static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp, + struct nfsd4_secinfo_no_name *sin) +{ + if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0) + return nfserr_bad_xdr; + return nfs_ok; +} - status = nfsd4_decode_stateid(argp, &lrp->lr_sid); - if (status) - return status; +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + __be32 *p, status; - READ_BUF(4); - lrp->lrf_body_len = be32_to_cpup(p++); - if (lrp->lrf_body_len > 0) { - READ_BUF(lrp->lrf_body_len); - READMEM(lrp->lrf_body, lrp->lrf_body_len); - } - } else { - lrp->lr_seg.offset = 0; - lrp->lr_seg.length = NFS4_MAX_UINT64; - } + status = nfsd4_decode_sessionid4(argp, &seq->sessionid); + if (status) + return status; + p = xdr_inline_decode(argp->xdr, XDR_UNIT * 4); + if (!p) + return nfserr_bad_xdr; + seq->seqid = be32_to_cpup(p++); + seq->slotid = be32_to_cpup(p++); + seq->maxslots = be32_to_cpup(p++); + seq->cachethis = be32_to_cpup(p); - DECODE_TAIL; + return nfs_ok; } -#endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, - struct nfsd4_fallocate *fallocate) +nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid) { - DECODE_HEAD; + struct nfsd4_test_stateid_id *stateid; + __be32 status; + u32 i; - status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); - if (status) - return status; + if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0) + return nfserr_bad_xdr; + + INIT_LIST_HEAD(&test_stateid->ts_stateid_list); + for (i = 0; i < test_stateid->ts_num_ids; i++) { + stateid = svcxdr_tmpalloc(argp, 
sizeof(*stateid));
+		if (!stateid)
+			return nfserrno(-ENOMEM);	/* XXX: not jukebox? */
+		INIT_LIST_HEAD(&stateid->ts_id_list);
+		list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+		status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid);
+		if (status)
+			return status;
+	}
+
+	return nfs_ok;
+}
 
-	READ_BUF(16);
-	p = xdr_decode_hyper(p, &fallocate->falloc_offset);
-	xdr_decode_hyper(p, &fallocate->falloc_length);
+static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp,
+					    struct nfsd4_destroy_clientid *dc)
+{
+	return nfsd4_decode_clientid4(argp, &dc->clientid);
+}
 
-	DECODE_TAIL;
+static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
+					    struct nfsd4_reclaim_complete *rc)
+{
+	if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0)
+		return nfserr_bad_xdr;
+	return nfs_ok;
 }
 
 static __be32
-nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+		       struct nfsd4_fallocate *fallocate)
 {
-	DECODE_HEAD;
+	__be32 status;
 
-	status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
-	if (status)
-		return status;
-	status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+	status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid);
 	if (status)
 		return status;
+	if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_offset) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_length) < 0)
+		return nfserr_bad_xdr;
 
-	READ_BUF(8 + 8 + 8);
-	p = xdr_decode_hyper(p, &clone->cl_src_pos);
-	p = xdr_decode_hyper(p, &clone->cl_dst_pos);
-	p = xdr_decode_hyper(p, &clone->cl_count);
-	DECODE_TAIL;
+	return nfs_ok;
 }
 
 static __be32
 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
 			struct nl4_server *ns)
 {
-	DECODE_HEAD;
 	struct nfs42_netaddr *naddr;
+	__be32 *p;
 
-	READ_BUF(4);
-	ns->nl4_type = be32_to_cpup(p++);
+	if (xdr_stream_decode_u32(argp->xdr, &ns->nl4_type) < 0)
+		return nfserr_bad_xdr;
 
 	/* currently supports 1 inter-server source server */
 	switch (ns->nl4_type) {
 	case NL4_NETADDR:
 		naddr = &ns->u.nl4_addr;
 
-		READ_BUF(4);
-		naddr->netid_len = be32_to_cpup(p++);
+		if (xdr_stream_decode_u32(argp->xdr, &naddr->netid_len) < 0)
+			return nfserr_bad_xdr;
 		if (naddr->netid_len > RPCBIND_MAXNETIDLEN)
-			goto xdr_error;
+			return nfserr_bad_xdr;
 
-		READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */
-		COPYMEM(naddr->netid, naddr->netid_len);
+		p = xdr_inline_decode(argp->xdr, naddr->netid_len);
+		if (!p)
+			return nfserr_bad_xdr;
+		memcpy(naddr->netid, p, naddr->netid_len);
 
-		naddr->addr_len = be32_to_cpup(p++);
+		if (xdr_stream_decode_u32(argp->xdr, &naddr->addr_len) < 0)
+			return nfserr_bad_xdr;
 		if (naddr->addr_len > RPCBIND_MAXUADDRLEN)
-			goto xdr_error;
+			return nfserr_bad_xdr;
 
-		READ_BUF(naddr->addr_len);
-		COPYMEM(naddr->addr, naddr->addr_len);
+		p = xdr_inline_decode(argp->xdr, naddr->addr_len);
+		if (!p)
+			return nfserr_bad_xdr;
+		memcpy(naddr->addr, p, naddr->addr_len);
 		break;
 	default:
-		goto xdr_error;
+		return nfserr_bad_xdr;
 	}
-	DECODE_TAIL;
+
+	return nfs_ok;
 }
 
 static __be32
 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 {
-	DECODE_HEAD;
 	struct nl4_server *ns_dummy;
-	int i, count;
+	u32 consecutive, i, count;
+	__be32 status;
 
-	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+	status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
 	if (status)
 		return status;
-	status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
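
/*
 * Illustrative sketch (not part of the patch): the netid/uaddr
 * handling in nfsd4_decode_nl4_server() above is the recipe this
 * series uses for every variable-length XDR opaque: decode the
 * length, enforce the protocol bound, then inline-decode exactly
 * that many octets and copy them out. The helper name and the
 * "buf"/"maxlen" parameters are invented for illustration.
 */
static __be32
decode_bounded_opaque(struct nfsd4_compoundargs *argp,
		      char *buf, u32 maxlen, u32 *lenp)
{
	__be32 *p;
	u32 len;

	if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
		return nfserr_bad_xdr;
	if (len > maxlen)
		return nfserr_bad_xdr;
	/* NULL here means the COMPOUND is shorter than "len" claims */
	p = xdr_inline_decode(argp->xdr, len);
	if (!p)
		return nfserr_bad_xdr;
	memcpy(buf, p, len);
	*lenp = len;
	return nfs_ok;
}
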
+	status = nfsd4_decode_stateid4(argp, &copy->cp_dst_stateid);
 	if (status)
 		return status;
+	if (xdr_stream_decode_u64(argp->xdr, &copy->cp_src_pos) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u64(argp->xdr, &copy->cp_dst_pos) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u64(argp->xdr, &copy->cp_count) < 0)
+		return nfserr_bad_xdr;
+	/* ca_consecutive: we always do consecutive copies */
+	if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u32(argp->xdr, &copy->cp_synchronous) < 0)
+		return nfserr_bad_xdr;
 
-	READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
-	p = xdr_decode_hyper(p, &copy->cp_src_pos);
-	p = xdr_decode_hyper(p, &copy->cp_dst_pos);
-	p = xdr_decode_hyper(p, &copy->cp_count);
-	p++; /* ca_consecutive: we always do consecutive copies */
-	copy->cp_synchronous = be32_to_cpup(p++);
-
-	count = be32_to_cpup(p++);
-
+	if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+		return nfserr_bad_xdr;
 	copy->cp_intra = false;
 	if (count == 0) { /* intra-server copy */
 		copy->cp_intra = true;
-		goto intra;
+		return nfs_ok;
 	}
 
-	/* decode all the supplied server addresses but use first */
+	/* decode all the supplied server addresses but use only the first */
 	status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
 	if (status)
 		return status;
 
 	ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
 	if (ns_dummy == NULL)
-		return nfserrno(-ENOMEM);
+		return nfserrno(-ENOMEM);	/* XXX: jukebox? */
 	for (i = 0; i < count - 1; i++) {
 		status = nfsd4_decode_nl4_server(argp, ns_dummy);
 		if (status) {
@@ -1839,44 +1936,64 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 		}
 	}
 	kfree(ns_dummy);
-intra:
-	DECODE_TAIL;
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_copy_notify *cn)
+{
+	__be32 status;
+
+	status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid);
+	if (status)
+		return status;
+	return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
 }
 
 static __be32
 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
 			    struct nfsd4_offload_status *os)
 {
-	return nfsd4_decode_stateid(argp, &os->stateid);
+	return nfsd4_decode_stateid4(argp, &os->stateid);
 }
 
 static __be32
-nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
-			 struct nfsd4_copy_notify *cn)
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
 	__be32 status;
 
-	status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid);
+	status = nfsd4_decode_stateid4(argp, &seek->seek_stateid);
 	if (status)
 		return status;
-	return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
+	if (xdr_stream_decode_u64(argp->xdr, &seek->seek_offset) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
+		return nfserr_bad_xdr;
+
+	return nfs_ok;
 }
 
 static __be32
-nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
 {
-	DECODE_HEAD;
+	__be32 status;
 
-	status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
+	status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid);
 	if (status)
 		return status;
+	status = nfsd4_decode_stateid4(argp, &clone->cl_dst_stateid);
+	if (status)
+		return status;
+	if (xdr_stream_decode_u64(argp->xdr, &clone->cl_src_pos) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u64(argp->xdr, &clone->cl_dst_pos) < 0)
+		return nfserr_bad_xdr;
+	if (xdr_stream_decode_u64(argp->xdr, &clone->cl_count) < 0)
+		return nfserr_bad_xdr;
 
-	READ_BUF(8 + 4);
-	p = xdr_decode_hyper(p, &seek->seek_offset);
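
/*
 * Illustrative sketch (not part of the patch): SEEK and CLONE above
 * show the fully converted decoder shape that every operation in
 * this file converges on -- xdr_stream_decode_u32()/u64() for
 * fixed-width fields, shared helpers for compound XDR types, and
 * nfserr_bad_xdr on any short buffer. "nfsd4_example" and its
 * fields are made up for the sketch.
 */
static __be32
nfsd4_decode_example(struct nfsd4_compoundargs *argp,
		     struct nfsd4_example *ex)
{
	__be32 status;

	/* each helper returns a negative value on a short buffer */
	if (xdr_stream_decode_u32(argp->xdr, &ex->ex_how) < 0)
		return nfserr_bad_xdr;
	if (xdr_stream_decode_u64(argp->xdr, &ex->ex_offset) < 0)
		return nfserr_bad_xdr;
	/* compound XDR types go through shared helpers */
	status = nfsd4_decode_stateid4(argp, &ex->ex_stateid);
	if (status)
		return status;

	return nfs_ok;
}
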
-	seek->seek_whence = be32_to_cpup(p);
-
-	DECODE_TAIL;
+	return nfs_ok;
 }
 
 /*
@@ -1889,13 +2006,14 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
  */
 
 /*
- * Decode data into buffer. Uses head and pages constructed by
- * svcxdr_construct_vector.
+ * Decode data into buffer.
  */
 static __be32
-nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
-		       struct page **pages, char **bufp, u32 buflen)
+nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr,
+		       char **bufp, u32 buflen)
 {
+	struct page **pages = xdr->pages;
+	struct kvec *head = xdr->head;
 	char *tmp, *dp;
 	u32 len;
 
@@ -1938,25 +2056,22 @@ static __be32
 nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
 {
-	DECODE_HEAD;
 	char *name, *sp, *dp;
 	u32 namelen, cnt;
+	__be32 *p;
 
-	READ_BUF(4);
-	namelen = be32_to_cpup(p++);
-
+	if (xdr_stream_decode_u32(argp->xdr, &namelen) < 0)
+		return nfserr_bad_xdr;
 	if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN))
 		return nfserr_nametoolong;
-
 	if (namelen == 0)
-		goto xdr_error;
-
-	READ_BUF(namelen);
-
+		return nfserr_bad_xdr;
+	p = xdr_inline_decode(argp->xdr, namelen);
+	if (!p)
+		return nfserr_bad_xdr;
 	name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1);
 	if (!name)
 		return nfserr_jukebox;
-
 	memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 
 	/*
@@ -1969,14 +2084,14 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
 
 	while (cnt-- > 0) {
 		if (*sp == '\0')
-			goto xdr_error;
+			return nfserr_bad_xdr;
 		*dp++ = *sp++;
 	}
 	*dp = '\0';
 
 	*namep = name;
 
-	DECODE_TAIL;
+	return nfs_ok;
 }
 
 /*
@@ -2008,13 +2123,11 @@ static __be32
 nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
 		      struct nfsd4_setxattr *setxattr)
 {
-	DECODE_HEAD;
 	u32 flags, maxcount, size;
-	struct kvec head;
-	struct page **pagelist;
+	__be32 status;
 
-	READ_BUF(4);
-	flags = be32_to_cpup(p++);
+	if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
+		return nfserr_bad_xdr;
 
 	if (flags > SETXATTR4_REPLACE)
 		return nfserr_inval;
@@ -2027,33 +2140,32 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
 	maxcount = svc_max_payload(argp->rqstp);
 	maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
 
-	READ_BUF(4);
-	size = be32_to_cpup(p++);
+	if (xdr_stream_decode_u32(argp->xdr, &size) < 0)
+		return nfserr_bad_xdr;
 	if (size > maxcount)
 		return nfserr_xattr2big;
 
 	setxattr->setxa_len = size;
 	if (size > 0) {
-		status = svcxdr_construct_vector(argp, &head, &pagelist, size);
-		if (status)
-			return status;
+		struct xdr_buf payload;
 
-		status = nfsd4_vbuf_from_vector(argp, &head, pagelist,
-						&setxattr->setxa_buf, size);
+		if (!xdr_stream_subsegment(argp->xdr, &payload, size))
+			return nfserr_bad_xdr;
+		status = nfsd4_vbuf_from_vector(argp, &payload,
+						&setxattr->setxa_buf, size);
+		if (status)
+			return status;
 	}
 
-	DECODE_TAIL;
+	return nfs_ok;
 }
 
 static __be32
 nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
 			struct nfsd4_listxattrs *listxattrs)
 {
-	DECODE_HEAD;
 	u32 maxcount;
 
-	READ_BUF(12);
-	p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie);
+	if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
+		return nfserr_bad_xdr;
 
 	/*
	 * If the cookie is too large to have even one user.x attribute
@@ -2063,7 +2175,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
 	    (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2)))
 		return nfserr_badcookie;
 
-	maxcount = be32_to_cpup(p++);
+	if (xdr_stream_decode_u32(argp->xdr, &maxcount) < 0)
+		return nfserr_bad_xdr;
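
/*
 * Illustrative sketch (not part of the patch): for bulk payloads such
 * as the SETXATTR value above, xdr_stream_subsegment() avoids a copy
 * at decode time by carving the next "size" bytes of the receive
 * buffer into a private xdr_buf whose head/pages/tail point into the
 * same memory; nfsd4_vbuf_from_vector() flattens it later. The helper
 * name "decode_payload" is invented for the sketch.
 */
static __be32
decode_payload(struct nfsd4_compoundargs *argp, char **bufp, u32 size)
{
	struct xdr_buf payload;

	/* fails if the COMPOUND is shorter than the advertised size */
	if (!xdr_stream_subsegment(argp->xdr, &payload, size))
		return nfserr_bad_xdr;
	/* flatten head + page data into one contiguous buffer */
	return nfsd4_vbuf_from_vector(argp, &payload, bufp, size);
}
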
 	if (maxcount < 8)
 		/* Always need at least 2 words (length and one character) */
 		return nfserr_inval;
 
@@ -2071,7 +2184,7 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
 	maxcount = min(maxcount, svc_max_payload(argp->rqstp));
 	listxattrs->lsxa_maxcount = maxcount;
 
-	DECODE_TAIL;
+	return nfs_ok;
 }
 
 static __be32
@@ -2198,43 +2311,54 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
 	return true;
 }
 
-static __be32
+static int
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
-	DECODE_HEAD;
 	struct nfsd4_op *op;
 	bool cachethis = false;
 	int auth_slack = argp->rqstp->rq_auth_slack;
 	int max_reply = auth_slack + 8; /* opcnt, status */
 	int readcount = 0;
 	int readbytes = 0;
+	__be32 *p;
 	int i;
 
-	READ_BUF(4);
-	argp->taglen = be32_to_cpup(p++);
-	READ_BUF(argp->taglen);
-	SAVEMEM(argp->tag, argp->taglen);
-	READ_BUF(8);
-	argp->minorversion = be32_to_cpup(p++);
-	argp->opcnt = be32_to_cpup(p++);
-	max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
-
-	if (argp->taglen > NFSD4_MAX_TAGLEN)
-		goto xdr_error;
+	if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0)
+		return 0;
+	max_reply += XDR_UNIT;
+	argp->tag = NULL;
+	if (unlikely(argp->taglen)) {
+		if (argp->taglen > NFSD4_MAX_TAGLEN)
+			return 0;
+		p = xdr_inline_decode(argp->xdr, argp->taglen);
+		if (!p)
+			return 0;
+		argp->tag = svcxdr_tmpalloc(argp, argp->taglen);
+		if (!argp->tag)
+			return 0;
+		memcpy(argp->tag, p, argp->taglen);
+		max_reply += xdr_align_size(argp->taglen);
+	}
+
+	if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
+		return 0;
+	if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+		return 0;
+
 	/*
 	 * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
 	 * here, so we return success at the xdr level so that
	 * nfsd4_proc can handle this as an NFS-level error.
 	 */
 	if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
-		return 0;
+		return 1;
 
 	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
 		argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
 		if (!argp->ops) {
 			argp->ops = argp->iops;
 			dprintk("nfsd: couldn't allocate room for COMPOUND\n");
-			goto xdr_error;
+			return 0;
 		}
 	}
 
@@ -2245,12 +2369,16 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		op = &argp->ops[i];
 		op->replay = NULL;
 
-		READ_BUF(4);
-		op->opnum = be32_to_cpup(p++);
-
-		if (nfsd4_opnum_in_range(argp, op))
+		if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0)
+			return 0;
+		if (nfsd4_opnum_in_range(argp, op)) {
 			op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
-		else {
+			if (op->status != nfs_ok)
+				trace_nfsd_compound_decode_err(argp->rqstp,
+							       argp->opcnt, i,
+							       op->opnum,
+							       op->status);
+		} else {
 			op->opnum = OP_ILLEGAL;
 			op->status = nfserr_op_illegal;
 		}
@@ -2289,7 +2417,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
 		clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
 
-	DECODE_TAIL;
+	return 1;
 }
 
 static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
@@ -2298,12 +2426,8 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
 	if (exp->ex_flags & NFSEXP_V4ROOT) {
 		*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
 		*p++ = 0;
-	} else if (IS_I_VERSION(inode)) {
+	} else
 		p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
-	} else {
-		*p++ = cpu_to_be32(stat->ctime.tv_sec);
-		*p++ = cpu_to_be32(stat->ctime.tv_nsec);
-	}
 	return p;
 }
 
@@ -2335,15 +2459,8 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
 static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
 {
 	*p++ = 
cpu_to_be32(c->atomic); - if (c->change_supported) { - p = xdr_encode_hyper(p, c->before_change); - p = xdr_encode_hyper(p, c->after_change); - } else { - *p++ = cpu_to_be32(c->before_ctime_sec); - *p++ = cpu_to_be32(c->before_ctime_nsec); - *p++ = cpu_to_be32(c->after_ctime_sec); - *p++ = cpu_to_be32(c->after_ctime_nsec); - } + p = xdr_encode_hyper(p, c->before_change); + p = xdr_encode_hyper(p, c->after_change); return p; } @@ -2558,7 +2675,7 @@ static u32 nfs4_file_type(umode_t mode) case S_IFREG: return NF4REG; case S_IFSOCK: return NF4SOCK; default: return NF4BAD; - }; + } } static inline __be32 @@ -3194,16 +3311,6 @@ out_acl: goto out; } - if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (IS_I_VERSION(d_inode(dentry))) - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR); - else - *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA); - } - #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, @@ -3756,8 +3863,8 @@ static __be32 nfsd4_encode_splice_read( { struct xdr_stream *xdr = &resp->xdr; struct xdr_buf *buf = xdr->buf; + int status, space_left; u32 eof; - int space_left; __be32 nfserr; __be32 *p = xdr->p - 2; @@ -3768,14 +3875,13 @@ static __be32 nfsd4_encode_splice_read( nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; - if (nfserr) { - /* - * nfsd_splice_actor may have already messed with the - * page length; reset it so as not to confuse - * xdr_truncate_encode: - */ - buf->page_len = 0; - return nfserr; + if (nfserr) + goto out_err; + status = svc_encode_result_payload(read->rd_rqstp, + buf->head[0].iov_len, maxcount); + if (status) { + nfserr = nfserrno(status); + goto out_err; } *(p++) = htonl(eof); @@ -3806,6 +3912,15 @@ static __be32 nfsd4_encode_splice_read( xdr->end = (__be32 *)((void *)xdr->end + space_left); return 0; + +out_err: + /* + * nfsd_splice_actor may have already messed with the + * page length; reset it so as not to confuse + * xdr_truncate_encode in our caller. 
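
/*
 * Illustrative sketch (not part of the patch): both READ encode paths
 * now tell the transport where the payload sits in the reply via
 * svc_encode_result_payload(), so RDMA can place it directly in the
 * client's buffer; on failure the encoder unwinds exactly as it would
 * for a read error. Condensed, with names following the function above:
 *
 *	status = svc_encode_result_payload(rqstp, buf->head[0].iov_len,
 *					   maxcount);
 *	if (status) {
 *		nfserr = nfserrno(status);
 *		goto out_err;	// reset page_len, truncate the encode
 *	}
 */
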
+	 */
+	buf->page_len = 0;
+	return nfserr;
 }
 
 static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
@@ -3829,7 +3944,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
 	read->rd_length = maxcount;
 	if (nfserr)
 		return nfserr;
-	if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
+	if (svc_encode_result_payload(resp->rqstp, starting_len + 8, maxcount))
 		return nfserr_io;
 	xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
 
@@ -3897,6 +4012,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
 	int zero = 0;
 	struct xdr_stream *xdr = &resp->xdr;
 	int length_offset = xdr->buf->len;
+	int status;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 4);
@@ -3917,9 +4033,13 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
 					(char *)p, &maxcount);
 	if (nfserr == nfserr_isdir)
 		nfserr = nfserr_inval;
-	if (nfserr) {
-		xdr_truncate_encode(xdr, length_offset);
-		return nfserr;
+	if (nfserr)
+		goto out_err;
+	status = svc_encode_result_payload(readlink->rl_rqstp, length_offset,
+					   maxcount);
+	if (status) {
+		nfserr = nfserrno(status);
+		goto out_err;
 	}
 
 	wire_count = htonl(maxcount);
@@ -3929,6 +4049,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
 	write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero,
 						4 - (maxcount&3));
 	return 0;
+
+out_err:
+	xdr_truncate_encode(xdr, length_offset);
+	return nfserr;
 }
 
 static __be32
@@ -4575,7 +4699,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
 	__be32 *p;
 
 	nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
-					 copy->cp_synchronous);
+					 !!copy->cp_synchronous);
 	if (nfserr)
 		return nfserr;
 
@@ -5182,10 +5306,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	if (op->status && opdesc &&
 			!(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE))
 		goto status;
-	BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+	BUG_ON(op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
 	       !nfsd4_enc_ops[op->opnum]);
 	encoder = nfsd4_enc_ops[op->opnum];
 	op->status = encoder(resp, op->status, &op->u);
+	if (op->status)
+		trace_nfsd_compound_encode_err(rqstp, op->opnum, op->status);
 	if (opdesc && opdesc->op_release)
 		opdesc->op_release(&op->u);
 	xdr_commit_encode(xdr);
@@ -5254,12 +5380,6 @@ nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
 	p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
 }
 
-int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
-{
-	return xdr_ressize_check(rqstp, p);
-}
-
 void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
 {
 	struct nfsd4_compoundargs *args = rqstp->rq_argp;
@@ -5268,8 +5388,6 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
 		kfree(args->ops);
 		args->ops = args->iops;
 	}
-	kfree(args->tmpp);
-	args->tmpp = NULL;
 	while (args->to_free) {
 		struct svcxdr_tmpbuf *tb = args->to_free;
 		args->to_free = tb->next;
@@ -5278,33 +5396,18 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
 }
 
 int
-nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
-{
-	return 1;
-}
-
-int
 nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd4_compoundargs *args = rqstp->rq_argp;
 
-	if (rqstp->rq_arg.head[0].iov_len % 4) {
-		/* client is nuts */
-		dprintk("%s: compound not properly padded! 
(peeraddr=%pISc xid=0x%x)", - __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid)); - return 0; - } - args->p = p; - args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; - args->pagelist = rqstp->rq_arg.pages; - args->pagelen = rqstp->rq_arg.page_len; - args->tail = false; - args->tmpp = NULL; + /* svcxdr_tmp_alloc */ args->to_free = NULL; + + args->xdr = &rqstp->rq_arg_stream; args->ops = args->iops; args->rqstp = rqstp; - return !nfsd4_decode_compound(args); + return nfsd4_decode_compound(args); } int diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index cb742e17e04a..d63cf8196fed 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -74,6 +74,14 @@ extern unsigned long nfsd_drc_mem_used; extern const struct seq_operations nfs_exports_op; /* + * Common void argument and result helpers + */ +struct nfsd_voidargs { }; +struct nfsd_voidres { }; +int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p); +int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p); + +/* * Function prototypes. */ int nfsd_svc(int nrservs, struct net *net, const struct cred *cred); @@ -387,7 +395,6 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ - FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c81dbbad8792..66f2ef67792a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -268,12 +268,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) if (fileid_type == FILEID_ROOT) dentry = dget(exp->ex_path.dentry); else { - dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, - data_left, fileid_type, - nfsd_acceptable, exp); - if (IS_ERR_OR_NULL(dentry)) + dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) { trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, dentry ? PTR_ERR(dentry) : -ESTALE); + switch (PTR_ERR(dentry)) { + case -ENOMEM: + case -ETIMEDOUT: + break; + default: + dentry = ERR_PTR(-ESTALE); + } + } } if (dentry == NULL) goto out; @@ -291,6 +299,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) fhp->fh_dentry = dentry; fhp->fh_export = exp; + + switch (rqstp->rq_vers) { + case 4: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) + fhp->fh_no_atomic_attr = true; + break; + case 3: + if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC) + fhp->fh_no_wcc = true; + break; + case 2: + fhp->fh_no_wcc = true; + } + return 0; out: exp_put(exp); @@ -559,6 +581,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, */ set_version_and_fsid_type(fhp, exp, ref_fh); + /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */ + fhp->fh_no_wcc = ref_fh ? 
ref_fh->fh_no_wcc : false; + if (ref_fh == fhp) fh_put(ref_fh); @@ -662,6 +687,7 @@ fh_put(struct svc_fh *fhp) exp_put(exp); fhp->fh_export = NULL; } + fhp->fh_no_wcc = false; return; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 56cfbc361561..cb20c2cd3469 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -35,6 +35,12 @@ typedef struct svc_fh { bool fh_locked; /* inode locked by us */ bool fh_want_write; /* remount protection taken */ + bool fh_no_wcc; /* no wcc data needed */ + bool fh_no_atomic_attr; + /* + * wcc data is not atomic with + * operation + */ int fh_flags; /* FH flags */ #ifdef CONFIG_NFSD_V3 bool fh_post_saved; /* post-op attrs saved */ @@ -54,7 +60,6 @@ typedef struct svc_fh { struct kstat fh_post_attr; /* full attrs after operation */ u64 fh_post_change; /* nfsv4 change; see above */ #endif /* CONFIG_NFSD_V3 */ - } svc_fh; #define NFSD4_FH_FOREIGN (1<<0) #define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f)) @@ -259,13 +264,16 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - u64 chattr; - - chattr = stat->ctime.tv_sec; - chattr <<= 30; - chattr += stat->ctime.tv_nsec; - chattr += inode_query_iversion(inode); - return chattr; + if (IS_I_VERSION(inode)) { + u64 chattr; + + chattr = stat->ctime.tv_sec; + chattr <<= 30; + chattr += stat->ctime.tv_nsec; + chattr += inode_query_iversion(inode); + return chattr; + } else + return time_to_chattr(&stat->ctime); } extern void fill_pre_wcc(struct svc_fh *fhp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d71549f9d42..9473d048efec 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -609,7 +609,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) * NFSv2 Server procedures. * Only the results of non-idempotent operations are cached. 
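
/*
 * Illustrative sketch (not part of the patch): nfsd4_change_attribute()
 * above packs ctime and the inode version counter into one 64-bit
 * change attribute -- seconds shifted left 30 bits, plus nanoseconds,
 * plus the version counter. Since tv_nsec < 10^9 < 2^30, the
 * nanoseconds never spill into the seconds field on their own, so a
 * ctime bump always moves the value forward. Mirrored with plain
 * types for illustration:
 */
static inline unsigned long long
pack_chattr(long long sec, long nsec, unsigned long long version)
{
	unsigned long long chattr = (unsigned long long)sec << 30;

	chattr += nsec;		/* fits below bit 30 */
	chattr += version;	/* may carry upward; that is intended */
	return chattr;
}
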
*/ -struct nfsd_void { int dummy; }; #define ST 1 /* status */ #define FH 8 /* filehandle */ @@ -618,10 +617,10 @@ struct nfsd_void { int dummy; }; static const struct svc_procedure nfsd_procedures2[18] = { [NFSPROC_NULL] = { .pc_func = nfsd_proc_null, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, @@ -647,10 +646,10 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_ROOT] = { .pc_func = nfsd_proc_root, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, @@ -685,10 +684,10 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, [NFSPROC_WRITECACHE] = { .pc_func = nfsd_proc_writecache, - .pc_decode = nfssvc_decode_void, - .pc_encode = nfssvc_encode_void, - .pc_argsize = sizeof(struct nfsd_void), - .pc_ressize = sizeof(struct nfsd_void), + .pc_decode = nfssvc_decode_voidarg, + .pc_encode = nfssvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd_voidargs), + .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, }, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 27b1ad136150..00384c332f9b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -29,6 +29,8 @@ #include "netns.h" #include "filecache.h" +#include "trace.h" + #define NFSDDBG_FACILITY NFSDDBG_SVC bool inter_copy_offload_enable; @@ -527,8 +529,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) return; nfsd_shutdown_net(net); - printk(KERN_WARNING "nfsd: last server has exited, flushing export " - "cache\n"); + pr_info("nfsd: last server has exited, flushing export cache\n"); nfsd_export_flush(net); } @@ -1009,17 +1010,16 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; - dprintk("nfsd_dispatch: vers %d proc %d\n", - rqstp->rq_vers, rqstp->rq_proc); - if (nfs_request_too_big(rqstp, proc)) - goto out_too_large; + goto out_decode_err; /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) */ rqstp->rq_cachetype = proc->pc_cachetype; + + svcxdr_init_decode(rqstp); if (!proc->pc_decode(rqstp, argv->iov_base)) goto out_decode_err; @@ -1050,29 +1050,51 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) out_cached_reply: return 1; -out_too_large: - dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); - *statp = rpc_garbage_args; - return 1; - out_decode_err: - dprintk("nfsd: failed to decode arguments!\n"); + trace_nfsd_garbage_args_err(rqstp); *statp = rpc_garbage_args; return 1; out_update_drop: - dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: - dprintk("nfsd: failed to encode result!\n"); + trace_nfsd_cant_encode_err(rqstp); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } +/** + * nfssvc_decode_voidarg - Decode void arguments + * @rqstp: Server RPC 
transaction context + * @p: buffer containing arguments to decode + * + * Return values: + * %0: Arguments were not valid + * %1: Decoding was successful + */ +int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + +/** + * nfssvc_encode_voidres - Encode void results + * @rqstp: Server RPC transaction context + * @p: buffer in which to encode results + * + * Return values: + * %0: Local error while encoding + * %1: Encoding was successful + */ +int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p) +{ + return xdr_ressize_check(rqstp, p); +} + int nfsd_pool_stats_open(struct inode *inode, struct file *file) { int ret; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 8a288c8fcd57..7aa6e8aca2c1 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -192,11 +192,6 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f /* * XDR decode functions */ -int -nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} int nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) @@ -423,11 +418,6 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) /* * XDR encode functions */ -int -nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_ressize_check(rqstp, p); -} int nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p) @@ -469,6 +459,7 @@ int nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readlinkres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; if (resp->status != nfs_ok) @@ -483,6 +474,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->len)) + return 0; return 1; } @@ -490,6 +483,7 @@ int nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd_readres *resp = rqstp->rq_resp; + struct kvec *head = rqstp->rq_res.head; *p++ = resp->status; if (resp->status != nfs_ok) @@ -507,6 +501,8 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p) *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); } + if (svc_encode_result_payload(rqstp, head->iov_len, resp->count)) + return 0; return 1; } diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c index 90967466a1e5..f008b95ceec2 100644 --- a/fs/nfsd/trace.c +++ b/fs/nfsd/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 99bf07800cd0..92a0973dd671 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -12,6 +12,100 @@ #include "export.h" #include "nfsfh.h" +#define NFSD_TRACE_PROC_ARG_FIELDS \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ + __array(unsigned char, client, sizeof(struct sockaddr_in6)) + +#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \ + do { \ + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ + __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ + rqstp->rq_xprt->xpt_locallen); \ + memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ + rqstp->rq_xprt->xpt_remotelen); \ + } while (0); + +#define NFSD_TRACE_PROC_RES_FIELDS \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __field(unsigned long, status) \ + __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ + __array(unsigned char, client, sizeof(struct sockaddr_in6)) + +#define 
NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \ + do { \ + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ + __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + __entry->status = be32_to_cpu(error); \ + memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ + rqstp->rq_xprt->xpt_locallen); \ + memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ + rqstp->rq_xprt->xpt_remotelen); \ + } while (0); + +TRACE_EVENT(nfsd_garbage_args_err, + TP_PROTO( + const struct svc_rqst *rqstp + ), + TP_ARGS(rqstp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_ARG_FIELDS + + __field(u32, vers) + __field(u32, proc) + ), + TP_fast_assign( + NFSD_TRACE_PROC_ARG_ASSIGNMENTS + + __entry->vers = rqstp->rq_vers; + __entry->proc = rqstp->rq_proc; + ), + TP_printk("xid=0x%08x vers=%u proc=%u", + __entry->xid, __entry->vers, __entry->proc + ) +); + +TRACE_EVENT(nfsd_cant_encode_err, + TP_PROTO( + const struct svc_rqst *rqstp + ), + TP_ARGS(rqstp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_ARG_FIELDS + + __field(u32, vers) + __field(u32, proc) + ), + TP_fast_assign( + NFSD_TRACE_PROC_ARG_ASSIGNMENTS + + __entry->vers = rqstp->rq_vers; + __entry->proc = rqstp->rq_proc; + ), + TP_printk("xid=0x%08x vers=%u proc=%u", + __entry->xid, __entry->vers, __entry->proc + ) +); + +#define show_nfsd_may_flags(x) \ + __print_flags(x, "|", \ + { NFSD_MAY_EXEC, "EXEC" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_SATTR, "SATTR" }, \ + { NFSD_MAY_TRUNC, "TRUNC" }, \ + { NFSD_MAY_LOCK, "LOCK" }, \ + { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \ + { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \ + { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \ + { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \ + { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \ + { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" }) + TRACE_EVENT(nfsd_compound, TP_PROTO(const struct svc_rqst *rqst, u32 args_opcnt), @@ -51,6 +145,56 @@ TRACE_EVENT(nfsd_compound_status, __get_str(name), __entry->status) ) +TRACE_EVENT(nfsd_compound_decode_err, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 args_opcnt, + u32 resp_opcnt, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS + + __field(u32, args_opcnt) + __field(u32, resp_opcnt) + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + + __entry->args_opcnt = args_opcnt; + __entry->resp_opcnt = resp_opcnt; + __entry->opnum = opnum; + ), + TP_printk("op=%u/%u opnum=%u status=%lu", + __entry->resp_opcnt, __entry->args_opcnt, + __entry->opnum, __entry->status) +); + +TRACE_EVENT(nfsd_compound_encode_err, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 opnum, + __be32 status + ), + TP_ARGS(rqstp, opnum, status), + TP_STRUCT__entry( + NFSD_TRACE_PROC_RES_FIELDS + + __field(u32, opnum) + ), + TP_fast_assign( + NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + + DECLARE_EVENT_CLASS(nfsd_fh_err_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -421,6 +565,9 @@ TRACE_EVENT(nfsd_clid_inuse_err, __entry->cl_boot, __entry->cl_id) ) +/* + * from fs/nfsd/filecache.h + */ TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); @@ -435,13 +582,6 @@ TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) -/* FIXME: This 
should probably be fleshed out in the future. */ -#define show_nf_may(val) \ - __print_flags(val, "|", \ - { NFSD_MAY_READ, "READ" }, \ - { NFSD_MAY_WRITE, "WRITE" }, \ - { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) - DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), TP_ARGS(nf), @@ -461,12 +601,12 @@ DECLARE_EVENT_CLASS(nfsd_file_class, __entry->nf_may = nf->nf_may; __entry->nf_file = nf->nf_file; ), - TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", __entry->nf_hashval, __entry->nf_inode, __entry->nf_ref, show_nf_flags(__entry->nf_flags), - show_nf_may(__entry->nf_may), + show_nfsd_may_flags(__entry->nf_may), __entry->nf_file) ) @@ -492,10 +632,10 @@ TRACE_EVENT(nfsd_file_acquire, __field(u32, xid) __field(unsigned int, hash) __field(void *, inode) - __field(unsigned int, may_flags) + __field(unsigned long, may_flags) __field(int, nf_ref) __field(unsigned long, nf_flags) - __field(unsigned char, nf_may) + __field(unsigned long, nf_may) __field(struct file *, nf_file) __field(u32, status) ), @@ -512,12 +652,12 @@ TRACE_EVENT(nfsd_file_acquire, __entry->status = be32_to_cpu(status); ), - TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u", __entry->xid, __entry->hash, __entry->inode, - show_nf_may(__entry->may_flags), __entry->nf_ref, - show_nf_flags(__entry->nf_flags), - show_nf_may(__entry->nf_may), __entry->nf_file, - __entry->status) + show_nfsd_may_flags(__entry->may_flags), + __entry->nf_ref, show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file, __entry->status) ); DECLARE_EVENT_CLASS(nfsd_file_search_class, @@ -533,7 +673,7 @@ DECLARE_EVENT_CLASS(nfsd_file_search_class, __entry->hash = hash; __entry->found = found; ), - TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + TP_printk("hash=0x%x inode=%p found=%d", __entry->hash, __entry->inode, __entry->found) ); @@ -561,7 +701,7 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event, __entry->mode = inode->i_mode; __entry->mask = mask; ), - TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + TP_printk("inode=%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, __entry->nlink, __entry->mode, __entry->mask) ); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1ecaceebee13..04937e51de56 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -978,18 +978,25 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, __be32 *verf) { struct file *file = nf->nf_file; + struct super_block *sb = file_inode(file)->i_sb; struct svc_export *exp; struct iov_iter iter; __be32 nfserr; int host_err; int use_wgather; loff_t pos = offset; + unsigned long exp_op_flags = 0; unsigned int pflags = current->flags; rwf_t flags = 0; + bool restore_flags = false; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (sb->s_export_op) + exp_op_flags = sb->s_export_op->flags; + + if (test_bit(RQ_LOCAL, &rqstp->rq_flags) && + !(exp_op_flags & EXPORT_OP_REMOTE_FS)) { /* * We want throttling in balance_dirty_pages() * and shrink_inactive_list() to only consider @@ -998,6 +1005,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, * the client's dirty pages or its congested queue. 
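
/*
 * Illustrative sketch (not part of the patch): the write path below
 * uses the save/set/restore idiom for process flags. PF_LOCAL_THROTTLE
 * is set only when the loopback-mount conditions hold, and a local
 * bool remembers whether to undo it. Reduced to its skeleton; the
 * "needs_local_throttle" condition is a stand-in:
 *
 *	unsigned int pflags = current->flags;
 *	bool restore_flags = false;
 *
 *	if (needs_local_throttle) {
 *		current->flags |= PF_LOCAL_THROTTLE;
 *		restore_flags = true;
 *	}
 *
 *	// ... issue the VFS write ...
 *
 *	if (restore_flags)
 *		current_restore_flags(pflags, PF_LOCAL_THROTTLE);
 */
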
*/ current->flags |= PF_LOCAL_THROTTLE; + restore_flags = true; + } exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1049,7 +1058,7 @@ out_nfserr: trace_nfsd_write_err(rqstp, fhp, offset, host_err); nfserr = nfserrno(host_err); } - if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) + if (restore_flags) current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } @@ -1724,7 +1733,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; - bool has_cached = false; + bool close_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1783,8 +1792,9 @@ retry: if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - if (nfsd_has_cached_files(ndentry)) { - has_cached = true; + if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(ndentry)) { + close_cached = true; goto out_dput_old; } else { host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); @@ -1805,7 +1815,7 @@ retry: * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - if (!has_cached) { + if (!close_cached) { fill_post_wcc(ffhp); fill_post_wcc(tfhp); } @@ -1819,8 +1829,8 @@ retry: * shouldn't be done with locks held however, so we delay it until this * point and then reattempt the whole shebang. */ - if (has_cached) { - has_cached = false; + if (close_cached) { + close_cached = false; nfsd_close_cached_files(ndentry); dput(ndentry); goto retry; @@ -1872,7 +1882,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) { - nfsd_close_cached_files(rdentry); + if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); } else { host_err = vfs_rmdir(dirp, rdentry); diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index 0ff336b0b25f..ad77387734cc 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -144,7 +144,6 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_void(struct svc_rqst *, __be32 *); int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); @@ -156,7 +155,6 @@ int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); -int nfssvc_encode_void(struct svc_rqst *, __be32 *); int nfssvc_encode_stat(struct svc_rqst *, __be32 *); int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *); int nfssvc_encode_diropres(struct svc_rqst *, __be32 *); diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index ae6fa6c9cb46..456fcd7a1038 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -273,7 +273,6 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *); int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); @@ -290,7 +289,6 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirplusargs(struct 
svc_rqst *, __be32 *); int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *); -int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *); int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *); int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *); int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 679d40af1bbb..a60ff5ce1a37 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -76,12 +76,7 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs) struct nfsd4_change_info { u32 atomic; - bool change_supported; - u32 before_ctime_sec; - u32 before_ctime_nsec; u64 before_change; - u32 after_ctime_sec; - u32 after_ctime_nsec; u64 after_change; }; @@ -252,7 +247,8 @@ struct nfsd4_listxattrs { struct nfsd4_open { u32 op_claim_type; /* request */ - struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ + u32 op_fnamelen; + char * op_fname; /* request - everything but CLAIM_PREV */ u32 op_delegate_type; /* request - CLAIM_PREV only */ stateid_t op_delegate_stateid; /* request - response */ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */ @@ -385,13 +381,6 @@ struct nfsd4_setclientid_confirm { nfs4_verifier sc_confirm; }; -struct nfsd4_saved_compoundargs { - __be32 *p; - __be32 *end; - int pagelen; - struct page **pagelist; -}; - struct nfsd4_test_stateid_id { __be32 ts_id_status; stateid_t ts_id_stateid; @@ -419,8 +408,7 @@ struct nfsd4_write { u64 wr_offset; /* request */ u32 wr_stable_how; /* request */ u32 wr_buflen; /* request */ - struct kvec wr_head; - struct page ** wr_pagelist; /* request */ + struct xdr_buf wr_payload; /* request */ u32 wr_bytes_written; /* response */ u32 wr_how_written; /* response */ @@ -433,7 +421,7 @@ struct nfsd4_exchange_id { u32 flags; clientid_t clientid; u32 seqid; - int spa_how; + u32 spa_how; u32 spo_must_enforce[3]; u32 spo_must_allow[3]; struct xdr_netobj nii_domain; @@ -554,7 +542,7 @@ struct nfsd4_copy { bool cp_intra; /* both */ - bool cp_synchronous; + u32 cp_synchronous; /* response */ struct nfsd42_write_res cp_res; @@ -615,7 +603,7 @@ struct nfsd4_copy_notify { }; struct nfsd4_op { - int opnum; + u32 opnum; const struct nfsd4_operation * opdesc; __be32 status; union nfsd4_op_u { @@ -696,15 +684,8 @@ struct svcxdr_tmpbuf { struct nfsd4_compoundargs { /* scratch variables for XDR decode */ - __be32 * p; - __be32 * end; - struct page ** pagelist; - int pagelen; - bool tail; - __be32 tmp[8]; - __be32 * tmpp; + struct xdr_stream *xdr; struct svcxdr_tmpbuf *to_free; - struct svc_rqst *rqstp; u32 taglen; @@ -767,22 +748,14 @@ static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); + cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); -int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *); -int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *); int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *); __be32 
nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e3726aca28ed..cd4da9535aed 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -134,14 +134,9 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
 static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
 
-#define nilfs_cnt32_gt(a, b)   \
-	(typecheck(__u32, a) && typecheck(__u32, b) &&  \
-	 ((__s32)(b) - (__s32)(a) < 0))
 #define nilfs_cnt32_ge(a, b)   \
 	(typecheck(__u32, a) && typecheck(__u32, b) &&  \
 	 ((__s32)(a) - (__s32)(b) >= 0))
-#define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
-#define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
 
 static int nilfs_prepare_segment_lock(struct super_block *sb,
 				      struct nilfs_transaction_info *ti)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5dcda8f20c04..e85e13c50d6d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -72,7 +72,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
  */
 static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
 				struct inode *inode, struct inode *dir,
-				const struct qstr *name)
+				const struct qstr *name, u32 cookie)
 {
 	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
@@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	}
 
 	rcu_read_lock();
-	f = fcheck(fd);
+	f = lookup_fd_rcu(fd);
 	rcu_read_unlock();
 
 	/* if (f != filp) means that we lost a race and another task/thread
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9167884a61ec..1192c9953620 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -268,12 +268,11 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
 			continue;
 
 		/*
-		 * If the event is for a child and this mark is on a parent not
+		 * If the event is on a child and this mark is on a parent not
 		 * watching children, don't send it!
 		 */
-		if (event_mask & FS_EVENT_ON_CHILD &&
-		    type == FSNOTIFY_OBJ_TYPE_INODE &&
-		    !(mark->mask & FS_EVENT_ON_CHILD))
+		if (type == FSNOTIFY_OBJ_TYPE_PARENT &&
+		    !(mark->mask & FS_EVENT_ON_CHILD))
 			continue;
 
 		marks_mask |= mark->mask;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8d3ad5ef2925..30d422b8c0fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -152,6 +152,13 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
 	if (mask & FS_ISDIR)
 		return false;
 
+	/*
+	 * All events that are possible on a child may also be reported with
+	 * parent/name info to inode/sb/mount. Otherwise, a watching parent
+	 * could result in events reported with unexpected name info to sb/mount.
+	 */
+	BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);
+
	/* Did either inode/sb/mount subscribe for events with parent/name? 
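
/*
 * Illustrative sketch (not part of the patch): the BUILD_BUG_ON above
 * pins a mask invariant at compile time -- every event bit that can be
 * raised on a child must also be representable with parent/name info,
 * i.e. FS_EVENTS_POSS_ON_CHILD must be a subset of
 * FS_EVENTS_POSS_TO_PARENT. The same subset test written out (the
 * macro name is invented; BUILD_BUG_ON() must sit inside a function,
 * as it does above):
 */
#define MASK_IS_SUBSET(a, b)	(((a) & ~(b)) == 0)

	BUILD_BUG_ON(!MASK_IS_SUBSET(FS_EVENTS_POSS_ON_CHILD,
				     FS_EVENTS_POSS_TO_PARENT));
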
*/ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); @@ -232,47 +239,76 @@ notify: } EXPORT_SYMBOL_GPL(__fsnotify_parent); +static int fsnotify_handle_inode_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) +{ + const struct path *path = fsnotify_data_path(data, data_type); + struct inode *inode = fsnotify_data_inode(data, data_type); + const struct fsnotify_ops *ops = group->ops; + + if (WARN_ON_ONCE(!ops->handle_inode_event)) + return 0; + + if ((inode_mark->mask & FS_EXCL_UNLINK) && + path && d_unlinked(path->dentry)) + return 0; + + /* Check interest of this mark in case event was sent with two marks */ + if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) + return 0; + + return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); +} + static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - struct inode *inode = fsnotify_data_inode(data, data_type); - const struct fsnotify_ops *ops = group->ops; + struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; - if (WARN_ON_ONCE(!ops->handle_inode_event)) - return 0; - if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; - /* - * An event can be sent on child mark iterator instead of inode mark - * iterator because of other groups that have interest of this inode - * and have marks on both parent and child. We can simplify this case. - */ - if (!inode_mark) { - inode_mark = child_mark; - child_mark = NULL; + if (parent_mark) { + /* + * parent_mark indicates that the parent inode is watching + * children and interested in this event, which is an event + * possible on child. But is *this mark* watching children and + * interested in this event? + */ + if (parent_mark->mask & FS_EVENT_ON_CHILD) { + ret = fsnotify_handle_inode_event(group, parent_mark, mask, + data, data_type, dir, name, 0); + if (ret) + return ret; + } + if (!inode_mark) + return 0; + } + + if (mask & FS_EVENT_ON_CHILD) { + /* + * Some events can be sent on both parent dir and child marks + * (e.g. FS_ATTRIB). If both parent dir and child are + * watching, report the event once to parent dir with name (if + * interested) and once to child without name (if interested). + * The child watcher is expecting an event without a file name + * and without the FS_EVENT_ON_CHILD flag. + */ + mask &= ~FS_EVENT_ON_CHILD; dir = NULL; name = NULL; } - ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name); - if (ret || !child_mark) - return ret; - - /* - * Some events can be sent on both parent dir and child marks - * (e.g. FS_ATTRIB). If both parent dir and child are watching, - * report the event once to parent dir with name and once to child - * without name. 
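
/*
 * Illustrative sketch (not part of the patch): after this rewrite the
 * delivery rules are explicit rather than implied by mark shuffling.
 * "report" is a stand-in for fsnotify_handle_inode_event() and the
 * helper is simplified for illustration.
 */
static int deliver(struct fsnotify_mark *parent_mark,
		   struct fsnotify_mark *inode_mark,
		   __u32 mask, struct inode *dir, const struct qstr *name)
{
	int ret;

	/* 1. parent mark: with dir/name, only if it watches children */
	if (parent_mark && (parent_mark->mask & FS_EVENT_ON_CHILD)) {
		ret = report(parent_mark, mask, dir, name);
		if (ret)
			return ret;
	}

	/* 2. inode mark: strip the child-only decorations first */
	if (mask & FS_EVENT_ON_CHILD) {
		mask &= ~FS_EVENT_ON_CHILD;
		dir = NULL;
		name = NULL;
	}
	return inode_mark ? report(inode_mark, mask, dir, name) : 0;
}
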
- */ - return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL); + return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, @@ -430,7 +466,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, struct fsnotify_iter_info iter_info = {}; struct super_block *sb; struct mount *mnt = NULL; - struct inode *child = NULL; + struct inode *parent = NULL; int ret = 0; __u32 test_mask, marks_mask; @@ -442,11 +478,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, inode = dir; } else if (mask & FS_EVENT_ON_CHILD) { /* - * Event on child - report on TYPE_INODE to dir if it is - * watching children and on TYPE_CHILD to child. + * Event on child - report on TYPE_PARENT to dir if it is + * watching children and on TYPE_INODE to child. */ - child = inode; - inode = dir; + parent = dir; } sb = inode->i_sb; @@ -460,7 +495,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!child || !child->i_fsnotify_marks)) + (!parent || !parent->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; @@ -468,8 +503,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; - if (child) - marks_mask |= child->i_fsnotify_mask; + if (parent) + marks_mask |= parent->i_fsnotify_mask; /* @@ -492,9 +527,9 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } - if (child) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] = - fsnotify_first_mark(&child->i_fsnotify_marks); + if (parent) { + iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] = + fsnotify_first_mark(&parent->i_fsnotify_marks); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 4327d0e9c364..2007e3711916 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -24,11 +24,10 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern int inotify_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, - struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info); +extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, + u32 mask, struct inode *inode, + struct inode *dir, + const struct qstr *name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; extern struct kmem_cache *inotify_inode_mark_cachep; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 9ddcbadc98e2..1901d799909b 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -55,25 +55,21 @@ static int inotify_merge(struct list_head *list, return event_compare(last_event, event); } -static int inotify_one_event(struct fsnotify_group *group, u32 mask, - struct fsnotify_mark *inode_mark, - const struct path *path, - const struct qstr *file_name, u32 cookie) +int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, + struct inode *inode, struct inode *dir, + const struct qstr *name, u32 
cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; struct fsnotify_event *fsn_event; + struct fsnotify_group *group = inode_mark->group; int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; - if ((inode_mark->mask & FS_EXCL_UNLINK) && - path && d_unlinked(path->dentry)) - return 0; - - if (file_name) { - len = file_name->len; + if (name) { + len = name->len; alloc_len += len + 1; } @@ -117,7 +113,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, event->sync_cookie = cookie; event->name_len = len; if (len) - strcpy(event->name, file_name->name); + strcpy(event->name, name->name); ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { @@ -131,37 +127,6 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, return 0; } -int inotify_handle_event(struct fsnotify_group *group, u32 mask, - const void *data, int data_type, struct inode *dir, - const struct qstr *file_name, u32 cookie, - struct fsnotify_iter_info *iter_info) -{ - const struct path *path = fsnotify_data_path(data, data_type); - struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - int ret = 0; - - if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info))) - return 0; - - /* - * Some events cannot be sent on both parent and child marks - * (e.g. IN_CREATE). Those events are always sent on inode_mark. - * For events that are possible on both parent and child (e.g. IN_OPEN), - * event is sent on inode_mark with name if the parent is watching and - * is sent on child_mark without name if child is watching. - * If both parent and child are watching, report the event with child's - * name here and report another event without child's name below. - */ - if (inode_mark) - ret = inotify_one_event(group, mask, inode_mark, path, - file_name, cookie); - if (ret || !child_mark) - return ret; - - return inotify_one_event(group, mask, child_mark, path, NULL, 0); -} - static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { inotify_ignored_and_remove_idr(fsn_mark, group); @@ -227,7 +192,7 @@ static void inotify_free_mark(struct fsnotify_mark *fsn_mark) } const struct fsnotify_ops inotify_fsnotify_ops = { - .handle_event = inotify_handle_event, + .handle_inode_event = inotify_handle_inode_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 186722ba3894..59c177011a0f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -37,6 +37,15 @@ #include <asm/ioctls.h> +/* + * An inotify watch requires allocating an inotify_inode_mark structure as + * well as pinning the watched inode. Doubling the size of a VFS inode + * should be more than enough to cover the additional filesystem inode + * size increase. 
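
/*
 * Illustrative sketch (not part of the patch): inotify_user_setup()
 * below sizes the default per-user watch limit from physical memory --
 * 1% of lowmem, divided by the estimated per-watch footprint, clamped
 * to [8192, 1048576]. Standalone rendering of the arithmetic; the
 * ~1300-byte WATCH_COST (mark + two inodes) is an assumption for the
 * example, as the real INOTIFY_WATCH_COST depends on struct sizes.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define WATCH_COST	1300UL

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long lowmem_pages = 1048576;	/* 4 GiB of 4 KiB pages */
	unsigned long watches_max;

	watches_max = ((lowmem_pages / 100) << PAGE_SHIFT) / WATCH_COST;
	watches_max = clamp_ul(watches_max, 8192UL, 1048576UL);
	printf("default max_user_watches: %lu\n", watches_max);	/* ~33035 */
	return 0;
}
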
+ */ +#define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \ + 2 * sizeof(struct inode)) + /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; @@ -486,14 +495,10 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_iter_info iter_info = { }; - - fsnotify_iter_set_report_type_mark(&iter_info, FSNOTIFY_OBJ_TYPE_INODE, - fsn_mark); /* Queue ignore event for the watch */ - inotify_handle_event(group, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, - NULL, NULL, 0, &iter_info); + inotify_handle_inode_event(fsn_mark, FS_IN_IGNORED, NULL, NULL, NULL, + 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -801,6 +806,18 @@ out: */ static int __init inotify_user_setup(void) { + unsigned long watches_max; + struct sysinfo si; + + si_meminfo(&si); + /* + * Allow up to 1% of addressable memory to be allocated for inotify + * watches (per user) limited to the range [8192, 1048576]. + */ + watches_max = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / + INOTIFY_WATCH_COST; + watches_max = clamp(watches_max, 8192UL, 1048576UL); + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); @@ -827,7 +844,7 @@ static int __init inotify_user_setup(void) inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; - init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; return 0; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f42967b738eb..e5aab265dff1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -323,7 +323,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, unsigned long flags; struct file *file = iocb->ki_filp; struct inode *vi = file_inode(file); - ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_inode *ni = NTFS_I(vi); ntfs_volume *vol = ni->vol; ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " @@ -365,9 +365,6 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, err = -EOPNOTSUPP; goto out; } - base_ni = ni; - if (NInoAttr(ni)) - base_ni = ni->ext.base_ntfs_ino; err = file_remove_privs(file); if (unlikely(err)) goto out; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index caf563981532..f7e4cbc26eaf 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2347,7 +2347,6 @@ int ntfs_truncate(struct inode *vi) ATTR_RECORD *a; const char *te = " Leaving file length out of sync with i_size."; int err, mp_size, size_change, alloc_change; - u32 attr_len; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(NInoAttr(ni)); @@ -2721,7 +2720,6 @@ do_non_resident_truncate: * this cannot fail since we are making the attribute smaller thus by * definition there is enough space to do so. 
*/ - attr_len = le32_to_cpu(a->length); err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); BUG_ON(err); diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index a0c40f1be7ac..bc1bf217b38e 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -478,7 +478,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) u8 *kaddr = NULL; RESTART_PAGE_HEADER *rstr1_ph = NULL; RESTART_PAGE_HEADER *rstr2_ph = NULL; - int log_page_size, log_page_mask, err; + int log_page_size, err; bool logfile_is_empty = true; u8 log_page_bits; @@ -501,7 +501,6 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) log_page_size = DefaultLogPageSize; else log_page_size = PAGE_SIZE; - log_page_mask = log_page_size - 1; /* * Use ntfs_ffs() instead of ffs() to enable the compiler to * optimize log_page_size and log_page_bits into constants. diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 79a231719460..3bd8119bed5e 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1198,7 +1198,6 @@ static int o2net_process_message(struct o2net_sock_container *sc, msglog(hdr, "bad magic\n"); ret = -EINVAL; goto out; - break; } /* find a handler for it */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c46bf7f581a1..2a237ab00453 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1088,8 +1088,8 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, child_inode_no = parent_inode_no; if (++i >= MAX_LOOKUP_TIMES) { - mlog(ML_NOTICE, "max lookup times reached, filesystem " - "may have nested directories, " + mlog_ratelimited(ML_NOTICE, "max lookup times reached, " + "filesystem may have nested directories, " "src inode: %llu, dest inode: %llu.\n", (unsigned long long)src_inode_no, (unsigned long long)dest_inode_no); diff --git a/fs/open.c b/fs/open.c index 9af548fb841b..1e06e443a565 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1010,6 +1010,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; + /* Scoping flags are mutually exclusive. */ + if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) + return -EINVAL; + /* Deal with the mode. 
*/ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) @@ -1292,7 +1296,7 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = __close_fd(current->files, fd); + int retval = close_fd(fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 955ecd4030f0..e5b616c93e11 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -275,7 +275,8 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } -struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, + bool is_upper) { struct ovl_fh *fh; int fh_type, dwords; @@ -319,7 +320,8 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) if (is_upper) fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; fh->fb.len = sizeof(fh->fb) + buflen; - fh->fb.uuid = *uuid; + if (ofs->config.uuid) + fh->fb.uuid = *uuid; return fh; @@ -328,8 +330,8 @@ out_err: return ERR_PTR(err); } -int ovl_set_origin(struct dentry *dentry, struct dentry *lower, - struct dentry *upper) +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, + struct dentry *lower, struct dentry *upper) { const struct ovl_fh *fh = NULL; int err; @@ -340,7 +342,7 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, * up and a pure upper inode. */ if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_real_fh(lower, false); + fh = ovl_encode_real_fh(ofs, lower, false); if (IS_ERR(fh)) return PTR_ERR(fh); } @@ -352,7 +354,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, fh ? fh->fb.len : 0, 0); kfree(fh); - return err; + /* Ignore -EPERM from setting "user.*" on symlink/special */ + return err == -EPERM ? 0 : err; } /* Store file handle of @upper dir in @index dir entry */ @@ -362,7 +365,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, const struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(upper, true); + fh = ovl_encode_real_fh(ofs, upper, true); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -380,6 +383,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, static int ovl_create_index(struct dentry *dentry, struct dentry *origin, struct dentry *upper) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); struct inode *dir = d_inode(indexdir); struct dentry *index = NULL; @@ -402,7 +406,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) return -EIO; - err = ovl_get_index_name(origin, &name); + err = ovl_get_index_name(ofs, origin, &name); if (err) return err; @@ -411,7 +415,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (IS_ERR(temp)) goto free_name; - err = ovl_set_upper_fh(OVL_FS(dentry->d_sb), upper, temp); + err = ovl_set_upper_fh(ofs, upper, temp); if (err) goto out; @@ -521,7 +525,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. 
*/ if (c->origin) { - err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp); + err = ovl_set_origin(ofs, c->dentry, c->lowerpath.dentry, temp); if (err) return err; } @@ -700,7 +704,7 @@ out_dput: static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) { int err; - struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); bool to_index = false; /* @@ -722,7 +726,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) if (to_index) { c->destdir = ovl_indexdir(c->dentry->d_sb); - err = ovl_get_index_name(c->lowerpath.dentry, &c->destname); + err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname); if (err) return err; } else if (WARN_ON(!c->parent)) { diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ed35be3fafc6..41ebf52f1bbc 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -211,7 +211,8 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) +static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct dentry *dentry, + u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; @@ -226,7 +227,7 @@ static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) goto fail; /* Encode an upper or lower file handle */ - fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + fh = ovl_encode_real_fh(ofs, enc_lower ? ovl_dentry_lower(dentry) : ovl_dentry_upper(dentry), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -249,6 +250,7 @@ fail: static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { + struct ovl_fs *ofs = OVL_FS(inode->i_sb); struct dentry *dentry; int bytes, buflen = *max_len << 2; @@ -260,7 +262,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - bytes = ovl_dentry_to_fid(dentry, fid, buflen); + bytes = ovl_dentry_to_fid(ofs, dentry, fid, buflen); dput(dentry); if (bytes <= 0) return FILEID_INVALID; @@ -680,7 +682,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, if (!ovl_upper_mnt(ofs)) return ERR_PTR(-EACCES); - upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true); + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); if (IS_ERR_OR_NULL(upper)) return upper; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index efccb7c1f9bc..bd9dd38347ae 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -53,9 +53,10 @@ static struct file *ovl_open_realfile(const struct file *file, err = inode_permission(realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); - } else if (!inode_owner_or_capable(realinode)) { - realfile = ERR_PTR(-EPERM); } else { + if (!inode_owner_or_capable(realinode)) + flags &= ~O_NOATIME; + realfile = open_with_fake_path(&file->f_path, flags, realinode, current_cred()); } @@ -75,12 +76,6 @@ static int ovl_change_flags(struct file *file, unsigned int flags) struct inode *inode = file_inode(file); int err; - flags |= OVL_OPEN_FLAGS; - - /* If some flag changed that cannot be changed then something's amiss */ - if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) - return -EIO; - flags &= OVL_SETFL_MASK; if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) @@ -397,48 +392,6 @@ out_unlock: return ret; } -static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t ret; - struct fd real; - const struct cred *old_cred; - 
- ret = ovl_real_fdget(in, &real); - if (ret) - return ret; - - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - - ovl_file_accessed(in); - fdput(real); - return ret; -} - -static ssize_t -ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct fd real; - const struct cred *old_cred; - ssize_t ret; - - ret = ovl_real_fdget(out, &real); - if (ret) - return ret; - - old_cred = ovl_override_creds(file_inode(out)->i_sb); - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - revert_creds(old_cred); - - ovl_file_accessed(out); - fdput(real); - return ret; -} - static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -541,46 +494,31 @@ static long ovl_real_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fd real; - const struct cred *old_cred; long ret; ret = ovl_real_fdget(file, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(file)->i_sb); ret = security_file_ioctl(real.file, cmd, arg); - if (!ret) + if (!ret) { + /* + * Don't override creds, since we currently can't safely check + * permissions before doing so. + */ ret = vfs_ioctl(real.file, cmd, arg); - revert_creds(old_cred); + } fdput(real); return ret; } -static unsigned int ovl_iflags_to_fsflags(unsigned int iflags) -{ - unsigned int flags = 0; - - if (iflags & S_SYNC) - flags |= FS_SYNC_FL; - if (iflags & S_APPEND) - flags |= FS_APPEND_FL; - if (iflags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - if (iflags & S_NOATIME) - flags |= FS_NOATIME_FL; - - return flags; -} - static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, - unsigned long arg, unsigned int flags) + unsigned long arg) { long ret; struct inode *inode = file_inode(file); - unsigned int oldflags; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -591,10 +529,13 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, inode_lock(inode); - /* Check the capability before cred override */ - oldflags = ovl_iflags_to_fsflags(READ_ONCE(inode->i_flags)); - ret = vfs_ioc_setflags_prepare(inode, oldflags, flags); - if (ret) + /* + * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE + * capability. 
+ */ + ret = -EPERM; + if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) && + !capable(CAP_LINUX_IMMUTABLE)) goto unlock; ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); @@ -613,46 +554,6 @@ unlock: } -static long ovl_ioctl_set_fsflags(struct file *file, unsigned int cmd, - unsigned long arg) -{ - unsigned int flags; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - return ovl_ioctl_set_flags(file, cmd, arg, flags); -} - -static unsigned int ovl_fsxflags_to_fsflags(unsigned int xflags) -{ - unsigned int flags = 0; - - if (xflags & FS_XFLAG_SYNC) - flags |= FS_SYNC_FL; - if (xflags & FS_XFLAG_APPEND) - flags |= FS_APPEND_FL; - if (xflags & FS_XFLAG_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - if (xflags & FS_XFLAG_NOATIME) - flags |= FS_NOATIME_FL; - - return flags; -} - -static long ovl_ioctl_set_fsxflags(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct fsxattr fa; - - memset(&fa, 0, sizeof(fa)); - if (copy_from_user(&fa, (void __user *) arg, sizeof(fa))) - return -EFAULT; - - return ovl_ioctl_set_flags(file, cmd, arg, - ovl_fsxflags_to_fsflags(fa.fsx_xflags)); -} - long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { long ret; @@ -663,12 +564,9 @@ long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ret = ovl_real_ioctl(file, cmd, arg); break; - case FS_IOC_SETFLAGS: - ret = ovl_ioctl_set_fsflags(file, cmd, arg); - break; - case FS_IOC_FSSETXATTR: - ret = ovl_ioctl_set_fsxflags(file, cmd, arg); + case FS_IOC_SETFLAGS: + ret = ovl_ioctl_set_flags(file, cmd, arg); break; default: @@ -801,8 +699,8 @@ const struct file_operations ovl_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ovl_compat_ioctl, #endif - .splice_read = ovl_splice_read, - .splice_write = ovl_splice_write, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b584dca845ba..d739e14c6814 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -329,8 +329,14 @@ static const char *ovl_get_link(struct dentry *dentry, bool ovl_is_private_xattr(struct super_block *sb, const char *name) { - return strncmp(name, OVL_XATTR_PREFIX, - sizeof(OVL_XATTR_PREFIX) - 1) == 0; + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_USER_PREFIX, + sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0; + else + return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, + sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0; } int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, @@ -476,7 +482,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int err; - struct inode *realinode = ovl_inode_real(inode); + struct inode *realinode = ovl_inode_realdata(inode); const struct cred *old_cred; if (!realinode->i_op->fiemap) @@ -690,7 +696,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) * For the first, copy up case, the union nlink does not change, whether the * operation succeeds or fails, but the upper inode nlink may change. * Therefore, before copy up, we store the union nlink value relative to the - * lower inode nlink in the index inode xattr trusted.overlay.nlink. + * lower inode nlink in the index inode xattr .overlay.nlink. 
* * For the second, upper hardlink case, the union nlink should be incremented * or decremented IFF the operation succeeds, aligned with nlink change of the diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index a6162c4076db..3fe05fb5d145 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -150,17 +150,22 @@ invalid: goto out; } -struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, - bool connected) +struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct vfsmount *mnt, bool connected) { struct dentry *real; int bytes; + if (!capable(CAP_DAC_READ_SEARCH)) + return NULL; + /* * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. + * In case of uuid=off option just make sure that stored uuid is null. */ - if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) + if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : + !uuid_is_null(&fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); @@ -354,7 +359,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, ofs->layers[i].fs->bad_uuid) continue; - origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt, + origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt, connected); if (origin) break; @@ -450,7 +455,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(real, is_upper); + fh = ovl_encode_real_fh(ofs, real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; @@ -488,7 +493,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); - upper = ovl_decode_real_fh(fh, ovl_upper_mnt(ofs), true); + upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); kfree(fh); if (IS_ERR_OR_NULL(upper)) @@ -640,12 +645,13 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) * index dir was cleared. Either way, that index cannot be used to identify * the overlay inode. */ -int ovl_get_index_name(struct dentry *origin, struct qstr *name) +int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, + struct qstr *name) { struct ovl_fh *fh; int err; - fh = ovl_encode_real_fh(origin, false); + fh = ovl_encode_real_fh(ofs, origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); @@ -694,7 +700,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, bool is_dir = d_is_dir(origin); int err; - err = ovl_get_index_name(origin, &name); + err = ovl_get_index_name(ofs, origin, &name); if (err) return ERR_PTR(err); @@ -805,7 +811,7 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, if (err) return err; - err = ovl_set_origin(dentry, lower, upper); + err = ovl_set_origin(ofs, dentry, lower, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); @@ -1003,6 +1009,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Just make sure a corresponding data dentry has been found.
*/ if (d.metacopy || (uppermetacopy && !ctr)) { + pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", + dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f8880aa2ba0e..b487e48c7fd4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -22,7 +22,9 @@ enum ovl_path_type { #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) #define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) -#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." +#define OVL_XATTR_NAMESPACE "overlay." +#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE enum ovl_xattr { OVL_XATTR_OPAQUE, @@ -113,10 +115,10 @@ struct ovl_fh { #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) -extern const char *ovl_xattr_table[]; +extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { - return ovl_xattr_table[ox]; + return ovl_xattr_table[ox][ofs->config.userxattr]; } static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) @@ -383,8 +385,8 @@ static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); } -struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, - bool connected); +struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, + struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, @@ -392,7 +394,8 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); -int ovl_get_index_name(struct dentry *origin, struct qstr *name); +int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, + struct qstr *name); struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); @@ -514,9 +517,10 @@ int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct super_block *sb, struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); -struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); -int ovl_set_origin(struct dentry *dentry, struct dentry *lower, - struct dentry *upper); +struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, + bool is_upper); +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, + struct dentry *lower, struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 1b5a2094df8e..fbd5e27ce66b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,9 +14,11 @@ struct ovl_config { bool redirect_follow; const char *redirect_mode; bool index; + bool uuid; bool nfs_export; int xino; bool metacopy; + bool userxattr; bool ovl_volatile; }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 290983bcfbb3..2bd570cbe8a4 100644 --- a/fs/overlayfs/super.c 
+++ b/fs/overlayfs/super.c @@ -79,7 +79,7 @@ static void ovl_dentry_release(struct dentry *dentry) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode) { - struct dentry *real; + struct dentry *real = NULL, *lower; /* It's an overlay file */ if (inode && d_inode(dentry) == inode) @@ -98,9 +98,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry, if (real && !inode && ovl_has_upperdata(d_inode(dentry))) return real; - real = ovl_dentry_lowerdata(dentry); - if (!real) + lower = ovl_dentry_lowerdata(dentry); + if (!lower) goto bug; + real = lower; /* Handle recursion */ real = d_real(real, inode); @@ -108,8 +109,10 @@ static struct dentry *ovl_d_real(struct dentry *dentry, if (!inode || inode == d_inode(real)) return real; bug: - WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry, - inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); + WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", + __func__, dentry, inode ? inode->i_sb->s_id : "NULL", + inode ? inode->i_ino : 0, real, + real && d_inode(real) ? d_inode(real)->i_ino : 0); return dentry; } @@ -356,6 +359,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + if (!ofs->config.uuid) + seq_puts(m, ",uuid=off"); if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); @@ -410,7 +415,10 @@ enum { OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, + OPT_UUID_ON, + OPT_UUID_OFF, OPT_NFS_EXPORT_ON, + OPT_USERXATTR, OPT_NFS_EXPORT_OFF, OPT_XINO_ON, OPT_XINO_OFF, @@ -429,6 +437,9 @@ static const match_table_t ovl_tokens = { {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, + {OPT_USERXATTR, "userxattr"}, + {OPT_UUID_ON, "uuid=on"}, + {OPT_UUID_OFF, "uuid=off"}, {OPT_NFS_EXPORT_ON, "nfs_export=on"}, {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, {OPT_XINO_ON, "xino=on"}, @@ -549,6 +560,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) index_opt = true; break; + case OPT_UUID_ON: + config->uuid = true; + break; + + case OPT_UUID_OFF: + config->uuid = false; + break; + case OPT_NFS_EXPORT_ON: config->nfs_export = true; nfs_export_opt = true; @@ -585,6 +604,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->ovl_volatile = true; break; + case OPT_USERXATTR: + config->userxattr = true; + break; + default: pr_err("unrecognized mount option \"%s\" or missing value\n", p); @@ -688,6 +711,28 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) } } + + /* Resolve userxattr -> !redirect && !metacopy dependency */ + if (config->userxattr) { + if (config->redirect_follow && redirect_opt) { + pr_err("conflicting options: userxattr,redirect_dir=%s\n", + config->redirect_mode); + return -EINVAL; + } + if (config->metacopy && metacopy_opt) { + pr_err("conflicting options: userxattr,metacopy=on\n"); + return -EINVAL; + } + /* + * Silently disable default setting of redirect and metacopy. + * This shall be the default in the future as well: these + * options must be explicitly enabled if used together with + * userxattr. 
+ */ + config->redirect_dir = config->redirect_follow = false; + config->metacopy = false; + } + return 0; } @@ -1037,8 +1082,14 @@ ovl_posix_acl_default_xattr_handler = { .set = ovl_posix_acl_xattr_set, }; -static const struct xattr_handler ovl_own_xattr_handler = { - .prefix = OVL_XATTR_PREFIX, +static const struct xattr_handler ovl_own_trusted_xattr_handler = { + .prefix = OVL_XATTR_TRUSTED_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_own_user_xattr_handler = { + .prefix = OVL_XATTR_USER_PREFIX, .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; @@ -1049,12 +1100,22 @@ static const struct xattr_handler ovl_other_xattr_handler = { .set = ovl_other_xattr_set, }; -static const struct xattr_handler *ovl_xattr_handlers[] = { +static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { #ifdef CONFIG_FS_POSIX_ACL &ovl_posix_acl_access_xattr_handler, &ovl_posix_acl_default_xattr_handler, #endif - &ovl_own_xattr_handler, + &ovl_own_trusted_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static const struct xattr_handler *ovl_user_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &ovl_posix_acl_access_xattr_handler, + &ovl_posix_acl_default_xattr_handler, +#endif + &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL }; @@ -1317,7 +1378,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs does not support RENAME_WHITEOUT.\n"); /* - * Check if upper/work fs supports trusted.overlay.* xattr + * Check if upper/work fs supports (trusted|user).overlay.* xattr */ err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { @@ -1456,10 +1517,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, /* * Verify upper root is exclusively associated with index dir. - * Older kernels stored upper fh in "trusted.overlay.origin" + * Older kernels stored upper fh in ".overlay.origin" * xattr. If that xattr exists, verify that it is a match to * upper dir file handle. In any case, verify or set xattr - * "trusted.overlay.upper" to indicate that index may have + * ".overlay.upper" to indicate that index may have * directory entries. */ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { @@ -1877,6 +1938,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ofs->share_whiteout = true; ofs->config.index = ovl_index_def; + ofs->config.uuid = true; ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; @@ -1956,6 +2018,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; + if (!ofs->config.uuid && ofs->numfs > 1) { + pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n"); + ofs->config.uuid = true; + } + if (!ovl_force_readonly(ofs) && ofs->config.index) { err = ovl_get_indexdir(sb, ofs, oe, &upperpath); if (err) @@ -1991,7 +2058,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_xattr = ovl_xattr_handlers; + sb->s_xattr = ofs->config.userxattr ? 
ovl_user_xattr_handlers : + ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; sb->s_flags |= SB_POSIXACL; sb->s_iflags |= SB_I_SKIP_SYNC; @@ -2028,6 +2096,7 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, static struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, .name = "overlay", + .fs_flags = FS_USERNS_MOUNT, .mount = ovl_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 23f475627d07..6569031af3cd 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -50,6 +50,9 @@ const struct cred *ovl_override_creds(struct super_block *sb) */ int ovl_can_decode_fh(struct super_block *sb) { + if (!capable(CAP_DAC_READ_SEARCH)) + return 0; + if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) return 0; @@ -582,9 +585,10 @@ bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_TAB_ENTRY(x) \ - [x] = OVL_XATTR_PREFIX x ## _POSTFIX + [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ + [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX } -const char *ovl_xattr_table[] = { +const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE), OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT), OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN), @@ -716,6 +720,7 @@ bool ovl_need_index(struct dentry *dentry) /* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); struct inode *dir = indexdir->d_inode; struct dentry *lowerdentry = ovl_dentry_lower(dentry); @@ -725,7 +730,7 @@ static void ovl_cleanup_index(struct dentry *dentry) struct qstr name = { }; int err; - err = ovl_get_index_name(lowerdentry, &name); + err = ovl_get_index_name(ofs, lowerdentry, &name); if (err) goto fail; @@ -879,6 +884,13 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry) if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; + /* + * getxattr on user.* may fail with EACCES in case there's no + * read permission on the inode. Not much we can do, other than + * tell the caller that this is not a metacopy inode. + */ + if (ofs->config.userxattr && res == -EACCES) + return 0; goto out; } diff --git a/fs/pipe.c b/fs/pipe.c index 0ac197658a2d..c5989cfd564d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1342,9 +1342,8 @@ out_revert_acct: } /* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. + * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is + * not enough to verify that this is a pipe. 
*/ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { diff --git a/fs/proc/array.c b/fs/proc/array.c index 65ec2029fa80..bb87e4d89cd8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -56,6 +56,7 @@ #include <linux/types.h> #include <linux/errno.h> #include <linux/time.h> +#include <linux/time_namespace.h> #include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/tty.h> @@ -368,6 +369,34 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_puts(m, "vulnerable"); break; } + + seq_puts(m, "\nSpeculationIndirectBranch:\t"); + switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)) { + case -EINVAL: + seq_puts(m, "unsupported"); + break; + case PR_SPEC_NOT_AFFECTED: + seq_puts(m, "not affected"); + break; + case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: + seq_puts(m, "conditional force disabled"); + break; + case PR_SPEC_PRCTL | PR_SPEC_DISABLE: + seq_puts(m, "conditional disabled"); + break; + case PR_SPEC_PRCTL | PR_SPEC_ENABLE: + seq_puts(m, "conditional enabled"); + break; + case PR_SPEC_ENABLE: + seq_puts(m, "always enabled"); + break; + case PR_SPEC_DISABLE: + seq_puts(m, "always disabled"); + break; + default: + seq_puts(m, "unknown"); + break; + } seq_putc(m, '\n'); } @@ -382,9 +411,9 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) @@ -533,8 +562,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, priority = task_prio(task); nice = task_nice(task); - /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(task->start_boottime); + /* apply timens offset for boottime and convert nsec -> ticks */ + start_time = + nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime)); seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..b3422cda2a91 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->exec_update_mutex); + int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE @@ -2021,7 +2021,7 @@ const struct dentry_operations pid_dentry_operations = * file type from dcache entry. * * Since all of the proc inode numbers are dynamically generated, the inode - * numbers do not exist until the inode is cache. This means creating the + * numbers do not exist until the inode is cache. This means creating * the dcache entry in readdir is necessary to keep the inode numbers * reported by readdir in sync with the inode numbers reported * by stat. 
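The fs/proc/array.c hunk above adds a "SpeculationIndirectBranch:" line to /proc/<pid>/status, emitted from task_seccomp(). A rough userspace sketch of how the new field can be read back (illustrative only, not part of this series):

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/self/status", "r");
        char line[256];

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                /* line emitted by the task_seccomp() hunk above */
                if (strncmp(line, "SpeculationIndirectBranch:", 26) == 0)
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}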
@@ -2930,7 +2930,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->exec_update_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2966,7 +2966,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return result; } @@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 81882a13212d..cb51763ed554 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -28,14 +28,13 @@ static int seq_show(struct seq_file *m, void *v) if (!task) return -ENOENT; - files = get_files_struct(task); - put_task_struct(task); - + task_lock(task); + files = task->files; if (files) { unsigned int fd = proc_fd(m->private); spin_lock(&files->file_lock); - file = fcheck_files(files, fd); + file = files_lookup_fd_locked(files, fd); if (file) { struct fdtable *fdt = files_fdtable(files); @@ -47,8 +46,9 @@ static int seq_show(struct seq_file *m, void *v) ret = 0; } spin_unlock(&files->file_lock); - put_files_struct(files); } + task_unlock(task); + put_task_struct(task); if (ret) return ret; @@ -57,6 +57,7 @@ static int seq_show(struct seq_file *m, void *v) (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id); + /* show_fd_locks() never dereferences files so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; @@ -83,18 +84,13 @@ static const struct file_operations proc_fdinfo_file_operations = { static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { - struct files_struct *files = get_files_struct(task); struct file *file; - if (!files) - return false; - rcu_read_lock(); - file = fcheck_files(files, fd); + file = task_lookup_fd_rcu(task, fd); if (file) *mode = file->f_mode; rcu_read_unlock(); - put_files_struct(files); return !!file; } @@ -146,29 +142,22 @@ static const struct dentry_operations tid_fd_dentry_operations = { static int proc_fd_link(struct dentry *dentry, struct path *path) { - struct files_struct *files = NULL; struct task_struct *task; int ret = -ENOENT; task = get_proc_task(d_inode(dentry)); if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - - if (files) { unsigned int fd = proc_fd(d_inode(dentry)); struct file *fd_file; - spin_lock(&files->file_lock); - fd_file = fcheck_files(files, fd); + fd_file = fget_task(task, fd); if (fd_file) { *path = fd_file->f_path; path_get(&fd_file->f_path); ret = 0; + fput(fd_file); } - spin_unlock(&files->file_lock); - put_files_struct(files); + put_task_struct(task); } return ret; @@ -229,7 +218,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, instantiate_t
instantiate) { struct task_struct *p = get_proc_task(file_inode(file)); - struct files_struct *files; unsigned int fd; if (!p) @@ -237,22 +225,18 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - files = get_files_struct(p); - if (!files) - goto out; rcu_read_lock(); - for (fd = ctx->pos - 2; - fd < files_fdtable(files)->max_fds; - fd++, ctx->pos++) { + for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = fcheck_files(files, fd); + f = task_lookup_next_fd_rcu(p, &fd); + ctx->pos = fd + 2LL; if (!f) - continue; + break; data.mode = f->f_mode; rcu_read_unlock(); data.fd = fd; @@ -261,13 +245,11 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out_fd_loop; + goto out; cond_resched(); rcu_read_lock(); } rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); out: put_task_struct(p); return 0; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b84663252add..6c0a05f55d6b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -349,6 +349,16 @@ static const struct file_operations proc_dir_operations = { .iterate_shared = proc_readdir, }; +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + /* * proc directories can do almost nothing.. */ @@ -471,8 +481,8 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, - struct proc_dir_entry *parent, void *data) +struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data, bool force_lookup) { struct proc_dir_entry *ent; @@ -484,10 +494,20 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->data = data; ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; + if (force_lookup) { + pde_force_lookup(ent); + } ent = proc_register(parent, ent); } return ent; } +EXPORT_SYMBOL_GPL(_proc_mkdir); + +struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, void *data) +{ + return _proc_mkdir(name, mode, parent, data, false); +} EXPORT_SYMBOL_GPL(proc_mkdir_data); struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 917cc85e3466..f60b379dcdc7 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -190,10 +190,9 @@ struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_e extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); -static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); - return pde; } extern void pde_put(struct proc_dir_entry *); @@ -310,3 +309,10 @@ extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); + +extern const struct dentry_operations proc_net_dentry_ops; +static inline void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be 
changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e502414b3556..4d2e64e9016c 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -193,8 +193,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) return 1; p = pfn_to_page(pfn); - if (!memmap_valid_within(pfn, p, page_zone(p))) - return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a5532e449..d6fc74619625 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", - global_zone_page_state(NR_PAGETABLE)); + global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index ed8a6306990c..18601042af99 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -39,22 +39,6 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } -static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - return 0; -} - -static const struct dentry_operations proc_net_dentry_ops = { - .d_revalidate = proc_net_d_revalidate, - .d_delete = always_delete_dentry, -}; - -static void pde_force_lookup(struct proc_dir_entry *pde) -{ - /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ - pde->proc_dops = &proc_net_dentry_ops; -} - static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -140,7 +124,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @mode: The file's access mode. * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. - * @write: The write method which which to 'modify' the file. + * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by PDE_DATA(). * * Create a network namespaced proc file in the @parent directory with the @@ -232,7 +216,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * @mode: The file's access mode. * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. - * @write: The write method which which to 'modify' the file. + * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by PDE_DATA(). * * Create a network-namespaced proc file in the @parent directory with the diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 4695b6de3151..f25e8531fd27 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -10,6 +10,7 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> @@ -118,6 +119,8 @@ static int show_stat(struct seq_file *p, void *v) irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); + /* shift boot timestamp according to the timens offset */ + timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index e16a49ebfe54..8adabde685f1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -14,6 +14,14 @@ config PSTORE If you don't have a platform persistent store driver, say N. 
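The show_stat() hunk above shifts the boot timestamp by the caller's time-namespace offset before it is printed, so the btime field of /proc/stat matches the namespace's view of boot time. A minimal userspace sketch (illustrative, not from this series) reading the adjusted value:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/stat", "r");
        char line[256];
        unsigned long long btime = 0;

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                /* printed by show_stat() after timens_sub_boottime() */
                if (sscanf(line, "btime %llu", &btime) == 1)
                        break;
        }
        fclose(f);
        printf("btime (timens-adjusted epoch seconds): %llu\n", btime);
        return 0;
}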
+config PSTORE_DEFAULT_KMSG_BYTES + int "Default kernel log storage space" if EXPERT + depends on PSTORE + default "10240" + help + Defines default size of pstore kernel log storage. + Can be enlarged if needed, not recommended to shrink it. + config PSTORE_DEFLATE_COMPRESS tristate "DEFLATE (ZLIB) compression" default y diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..4bb8a344957a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -90,7 +90,6 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage"); static DEFINE_MUTEX(pstore_blk_lock); static struct block_device *psblk_bdev; static struct pstore_zone_info *pstore_zone_info; -static pstore_blk_panic_write_op blkdev_panic_write; struct bdev_info { dev_t devt; @@ -245,7 +244,7 @@ static struct block_device *psblk_get_bdev(void *holder, return bdev; } - nr_sects = part_nr_sects_read(bdev->bd_part); + nr_sects = bdev_nr_sectors(bdev); if (!nr_sects) { pr_err("not enough space for '%s'\n", blkdev); blkdev_put(bdev, mode); @@ -341,24 +340,11 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, return ret; } -static ssize_t psblk_blk_panic_write(const char *buf, size_t size, - loff_t off) -{ - int ret; - - if (!blkdev_panic_write) - return -EOPNOTSUPP; - - /* size and off must align to SECTOR_SIZE for block device */ - ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, - size >> SECTOR_SHIFT); - /* try next zone */ - if (ret == -ENOMSG) - return ret; - return ret ? -EIO : size; -} - -static int __register_pstore_blk(struct pstore_blk_info *info) +/* + * This takes its configuration only from the module parameters now. + * See psblk_get_bdev() and blkdev. + */ +static int __register_pstore_blk(void) { char bdev_name[BDEVNAME_SIZE]; struct block_device *bdev; @@ -378,68 +364,34 @@ static int __register_pstore_blk(struct pstore_blk_info *info) } /* only allow driver matching the @blkdev */ - if (!binfo.devt || (!best_effort && - MAJOR(binfo.devt) != info->major)) { - pr_debug("invalid major %u (expect %u)\n", - info->major, MAJOR(binfo.devt)); + if (!binfo.devt) { + pr_debug("no major\n"); ret = -ENODEV; goto err_put_bdev; } /* psblk_bdev must be assigned before register to pstore/blk */ psblk_bdev = bdev; - blkdev_panic_write = info->panic_write; - - /* Copy back block device details. */ - info->devt = binfo.devt; - info->nr_sects = binfo.nr_sects; - info->start_sect = binfo.start_sect; memset(&dev, 0, sizeof(dev)); - dev.total_size = info->nr_sects << SECTOR_SHIFT; - dev.flags = info->flags; + dev.total_size = binfo.nr_sects << SECTOR_SHIFT; dev.read = psblk_generic_blk_read; dev.write = psblk_generic_blk_write; - dev.erase = NULL; - dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; ret = __register_pstore_device(&dev); if (ret) goto err_put_bdev; bdevname(bdev, bdev_name); - pr_info("attached %s%s\n", bdev_name, - info->panic_write ? "" : " (no dedicated panic_write!)"); + pr_info("attached %s (no dedicated panic_write!)\n", bdev_name); return 0; err_put_bdev: psblk_bdev = NULL; - blkdev_panic_write = NULL; psblk_put_bdev(bdev, holder); return ret; } -/** - * register_pstore_blk() - register block device to pstore/blk - * - * @info: details on the desired block device interface - * - * Return: - * * 0 - OK - * * Others - something error. 
- */ -int register_pstore_blk(struct pstore_blk_info *info) -{ - int ret; - - mutex_lock(&pstore_blk_lock); - ret = __register_pstore_blk(info); - mutex_unlock(&pstore_blk_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(register_pstore_blk); - static void __unregister_pstore_blk(unsigned int major) { struct pstore_device_info dev = { .read = psblk_generic_blk_read }; @@ -449,24 +401,10 @@ static void __unregister_pstore_blk(unsigned int major) if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { __unregister_pstore_device(&dev); psblk_put_bdev(psblk_bdev, holder); - blkdev_panic_write = NULL; psblk_bdev = NULL; } } -/** - * unregister_pstore_blk() - unregister block device from pstore/blk - * - * @major: the major device number of device - */ -void unregister_pstore_blk(unsigned int major) -{ - mutex_lock(&pstore_blk_lock); - __unregister_pstore_blk(major); - mutex_unlock(&pstore_blk_lock); -} -EXPORT_SYMBOL_GPL(unregister_pstore_blk); - /* get information of pstore/blk */ int pstore_blk_get_config(struct pstore_blk_config *info) { @@ -483,12 +421,11 @@ EXPORT_SYMBOL_GPL(pstore_blk_get_config); static int __init pstore_blk_init(void) { - struct pstore_blk_info info = { }; int ret = 0; mutex_lock(&pstore_blk_lock); if (!pstore_zone_info && best_effort && blkdev[0]) - ret = __register_pstore_blk(&info); + ret = __register_pstore_blk(); mutex_unlock(&pstore_blk_lock); return ret; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index c331efe8de95..93a217e4f563 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -266,7 +266,7 @@ static void parse_options(char *options) */ static int pstore_show_options(struct seq_file *m, struct dentry *root) { - if (kmsg_bytes != PSTORE_DEFAULT_KMSG_BYTES) + if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7fb219042f13..801d6c0b170c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,6 @@ #include <linux/time.h> #include <linux/pstore.h> -#define PSTORE_DEFAULT_KMSG_BYTES 10240 extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 36714df37d5d..32f64abc277c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -101,7 +101,7 @@ static char *big_oops_buf; static size_t big_oops_buf_sz; /* How much of the console log to snapshot */ -unsigned long kmsg_bytes = PSTORE_DEFAULT_KMSG_BYTES; +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; void pstore_set_kmsg_bytes(int bytes) { diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 3ce89216670c..5266ccbec007 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -1299,6 +1299,10 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_warn("total_size must be >= 4096\n"); return -EINVAL; } + if (info->total_size > SZ_128M) { + pr_warn("capping size to 128MiB\n"); + info->total_size = SZ_128M; + } if (!info->kmsg_size && !info->pmsg_size && !info->console_size && !info->ftrace_size) { diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index bb02989d92b6..4f1373463766 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2455,7 +2455,7 @@ int dquot_resume(struct super_block *sb, int type) ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, flags); if (ret < 0) - vfs_cleanup_quota_inode(sb, type); + vfs_cleanup_quota_inode(sb, cnt); } return ret; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 9af95c7a0bbe..6d16b2be5ac4 100644 --- a/fs/quota/quota.c +++ 
b/fs/quota/quota.c @@ -20,6 +20,7 @@ #include <linux/writeback.h> #include <linux/nospec.h> #include "compat.h" +#include "../internal.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -865,27 +866,42 @@ static bool quotactl_cmd_onoff(int cmd) static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK - struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); + bool excl = false, thawed = false; + int error; + dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); - bdev = lookup_bdev(tmp->name); + error = lookup_bdev(tmp->name, &dev); putname(tmp); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); - if (quotactl_cmd_onoff(cmd)) - sb = get_super_exclusive_thawed(bdev); - else if (quotactl_cmd_write(cmd)) - sb = get_super_thawed(bdev); - else - sb = get_super(bdev); - bdput(bdev); + if (error) + return ERR_PTR(error); + + if (quotactl_cmd_onoff(cmd)) { + excl = true; + thawed = true; + } else if (quotactl_cmd_write(cmd)) { + thawed = true; + } + +retry: + sb = user_get_super(dev, excl); if (!sb) return ERR_PTR(-ENODEV); - + if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen == SB_UNFROZEN); + put_super(sb); + goto retry; + } return sb; + #else return ERR_PTR(-ENODEV); #endif diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index a6f856f341dc..c5562c871c8b 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); } static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) @@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ssize_t ret; ret = sb->s_op->quota_write(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) @@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, blk); goto out_buf; } - dquot->dq_off = (blk << info->dqi_blocksize_bits) + + dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); @@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ret = -EIO; goto out_buf; } else { - ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf: diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index e69a2bfdd81c..c21106557a37 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -157,6 +157,25 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); qinfo->dqi_ops = &v2r1_qtree_ops; } + ret = -EUCLEAN; + /* Some sanity checks of the read headers... 
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e69a2bfdd81c..c21106557a37 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -157,6 +157,25 @@ static int v2_read_file_info(struct super_block *sb, int type)
 		qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
 		qinfo->dqi_ops = &v2r1_qtree_ops;
 	}
+	ret = -EUCLEAN;
+	/* Some sanity checks of the read headers... */
+	if ((loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits >
+	    i_size_read(sb_dqopt(sb)->files[type])) {
+		quota_error(sb, "Number of blocks too big for quota file size (%llu > %llu).",
+		    (loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits,
+		    i_size_read(sb_dqopt(sb)->files[type]));
+		goto out;
+	}
+	if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
+		quota_error(sb, "Free block number too big (%u >= %u).",
+			    qinfo->dqi_free_blk, qinfo->dqi_blocks);
+		goto out;
+	}
+	if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
+		quota_error(sb, "Block with free entry too big (%u >= %u).",
+			    qinfo->dqi_free_entry, qinfo->dqi_blocks);
+		goto out;
+	}
 	ret = 0;
out:
 	up_read(&dqopt->dqio_sem);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 8bf88d690729..476a7ff49482 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -454,6 +454,12 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
 					 "(second one): %h", ih);
 			return 0;
 		}
+		if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) {
+			reiserfs_warning(NULL, "reiserfs-5093",
+					 "item entry count seems wrong %h",
+					 ih);
+			return 0;
+		}
 		prev_location = ih_location(ih);
 	}
 
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e6099beefa97..77dba3a49e65 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -456,8 +456,16 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 	if (ret)
 		return ret;
 
+	/*
+	 * This is redundant if called from vfs_dedupe_file_range(), but other
+	 * callers need it and it's not performance sensitive...
+	 */
+	ret = remap_verify_area(src_file, src_pos, len, false);
+	if (ret)
+		goto out_drop_write;
+
 	ret = remap_verify_area(dst_file, dst_pos, len, true);
-	if (ret < 0)
+	if (ret)
 		goto out_drop_write;
 
 	ret = -EPERM;
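[Note: vfs_dedupe_file_range_one() backs the FIDEDUPERANGE ioctl, and the hunk above adds a source-range check for callers that do not come through vfs_dedupe_file_range(). A hedged userspace sketch of driving dedupe through that ioctl; the paths are placeholders and both files are assumed to hold identical data:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* FIDEDUPERANGE, struct file_dedupe_range */

int main(void)
{
	int src = open("/tmp/a", O_RDONLY);
	int dst = open("/tmp/b", O_RDWR);
	if (src < 0 || dst < 0)
		return 1;

	/* One destination record follows the fixed-size header. */
	struct file_dedupe_range *range =
		calloc(1, sizeof(*range) + sizeof(struct file_dedupe_range_info));
	if (!range)
		return 1;
	range->src_offset = 0;
	range->src_length = 4096;	/* both ranges get verified in the kernel */
	range->dest_count = 1;
	range->info[0].dest_fd = dst;
	range->info[0].dest_offset = 0;

	if (ioctl(src, FIDEDUPERANGE, range) < 0)
		perror("FIDEDUPERANGE");
	else if (range->info[0].status == FILE_DEDUPE_RANGE_SAME)
		printf("deduped %llu bytes\n",
		       (unsigned long long)range->info[0].bytes_deduped);
	return 0;
}

End of note.]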
diff --git a/fs/statfs.c b/fs/statfs.c
index 59f33752c131..68cb07788750 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -235,7 +235,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
 
 static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
 {
-	struct super_block *s = user_get_super(dev);
+	struct super_block *s = user_get_super(dev, false);
 	int err;
 	if (!s)
 		return -EINVAL;
diff --git a/fs/super.c b/fs/super.c
index 98bb0629ee10..2c6cdea2ab2d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -307,7 +307,7 @@ static void __put_super(struct super_block *s)
  * Drops a temporary reference, frees superblock if there's no
 * references left.
 */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
@@ -740,7 +740,14 @@ void iterate_supers_type(struct file_system_type *type,
 
 EXPORT_SYMBOL(iterate_supers_type);
 
-static struct super_block *__get_super(struct block_device *bdev, bool excl)
+/**
+ * get_super - get the superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device given. %NULL is returned if no match is found.
+ */
+struct super_block *get_super(struct block_device *bdev)
 {
 	struct super_block *sb;
 
@@ -755,17 +762,11 @@ rescan:
 		if (sb->s_bdev == bdev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
-			if (!excl)
-				down_read(&sb->s_umount);
-			else
-				down_write(&sb->s_umount);
+			down_read(&sb->s_umount);
 			/* still alive? */
 			if (sb->s_root && (sb->s_flags & SB_BORN))
 				return sb;
-			if (!excl)
-				up_read(&sb->s_umount);
-			else
-				up_write(&sb->s_umount);
+			up_read(&sb->s_umount);
 			/* nope, got unmounted */
 			spin_lock(&sb_lock);
 			__put_super(sb);
@@ -777,66 +778,6 @@ rescan:
 }
 
 /**
- * get_super - get the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. %NULL is returned if no match is found.
- */
-struct super_block *get_super(struct block_device *bdev)
-{
-	return __get_super(bdev, false);
-}
-EXPORT_SYMBOL(get_super);
-
-static struct super_block *__get_super_thawed(struct block_device *bdev,
-					      bool excl)
-{
-	while (1) {
-		struct super_block *s = __get_super(bdev, excl);
-		if (!s || s->s_writers.frozen == SB_UNFROZEN)
-			return s;
-		if (!excl)
-			up_read(&s->s_umount);
-		else
-			up_write(&s->s_umount);
-		wait_event(s->s_writers.wait_unfrozen,
-			   s->s_writers.frozen == SB_UNFROZEN);
-		put_super(s);
-	}
-}
-
-/**
- * get_super_thawed - get thawed superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device. The superblock is returned once it is thawed
- * (or immediately if it was not frozen). %NULL is returned if no match
- * is found.
- */
-struct super_block *get_super_thawed(struct block_device *bdev)
-{
-	return __get_super_thawed(bdev, false);
-}
-EXPORT_SYMBOL(get_super_thawed);
-
-/**
- * get_super_exclusive_thawed - get thawed superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device. The superblock is returned once it is thawed
- * (or immediately if it was not frozen) and s_umount semaphore is held
- * in exclusive mode. %NULL is returned if no match is found.
- */
-struct super_block *get_super_exclusive_thawed(struct block_device *bdev)
-{
-	return __get_super_thawed(bdev, true);
-}
-EXPORT_SYMBOL(get_super_exclusive_thawed);
-
-/**
  * get_active_super - get an active reference to the superblock of a device
  * @bdev: device to get the superblock for
  *
@@ -867,7 +808,7 @@ restart:
 	return NULL;
 }
 
-struct super_block *user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev, bool excl)
 {
 	struct super_block *sb;
 
@@ -879,11 +820,17 @@ rescan:
 		if (sb->s_dev == dev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
-			down_read(&sb->s_umount);
+			if (excl)
+				down_write(&sb->s_umount);
+			else
+				down_read(&sb->s_umount);
 			/* still alive? */
 			if (sb->s_root && (sb->s_flags & SB_BORN))
 				return sb;
-			up_read(&sb->s_umount);
+			if (excl)
+				up_write(&sb->s_umount);
+			else
+				up_read(&sb->s_umount);
 			/* nope, got unmounted */
 			spin_lock(&sb_lock);
 			__put_super(sb);
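[Note: with get_super_thawed()/get_super_exclusive_thawed() gone, the wait-for-thaw loop lives in the caller, which is why put_super() is now exported. A sketch of the caller-side pattern, mirroring the new quotactl_block(); the helper name get_thawed_super is hypothetical, not an API this series adds:

/* Take s_umount via user_get_super(); if the fs is frozen, drop the
 * lock, sleep until it thaws, and retry the lookup from scratch. */
static struct super_block *get_thawed_super(dev_t dev, bool excl)
{
	struct super_block *sb;

retry:
	sb = user_get_super(dev, excl);
	if (!sb)
		return NULL;
	if (sb->s_writers.frozen != SB_UNFROZEN) {
		if (excl)
			up_write(&sb->s_umount);
		else
			up_read(&sb->s_umount);
		wait_event(sb->s_writers.wait_unfrozen,
			   sb->s_writers.frozen == SB_UNFROZEN);
		put_super(sb);	/* exported for exactly this use */
		goto retry;
	}
	return sb;
}

End of note.]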
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index b93b3cd10bfd..0886d835f597 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -12,7 +12,6 @@
 #include <linux/crypto.h>
 #include <linux/verification.h>
 #include <crypto/hash.h>
-#include <crypto/sha.h>
 #include <crypto/algapi.h>
 #include <keys/user-type.h>
 #include <keys/asymmetric-type.h>
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 155521e51ac5..7949d7c9aa8c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -203,6 +203,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
 
 	err = fscrypt_prepare_lookup(dir, dentry, &nm);
+	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		return d_splice_alias(NULL, dentry);
 	if (err)
@@ -270,6 +271,15 @@ done:
 	return d_splice_alias(inode, dentry);
 }
 
+static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry,
+				struct fscrypt_name *nm)
+{
+	if (fscrypt_is_nokey_name(dentry))
+		return -ENOKEY;
+
+	return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm);
+}
+
 static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 			bool excl)
 {
@@ -293,7 +303,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	if (err)
 		return err;
 
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+	err = ubifs_prepare_create(dir, dentry, &nm);
 	if (err)
 		goto out_budg;
 
@@ -505,7 +515,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
 		return 0;
 
 	if (encrypted) {
-		err = fscrypt_get_encryption_info(dir);
+		err = fscrypt_prepare_readdir(dir);
 		if (err)
 			return err;
 
@@ -953,7 +963,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (err)
 		return err;
 
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+	err = ubifs_prepare_create(dir, dentry, &nm);
 	if (err)
 		goto out_budg;
 
@@ -1038,7 +1048,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 		return err;
 	}
 
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+	err = ubifs_prepare_create(dir, dentry, &nm);
 	if (err) {
 		kfree(dev);
 		goto out_budg;
@@ -1122,7 +1132,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 	if (err)
 		return err;
 
-	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+	err = ubifs_prepare_create(dir, dentry, &nm);
 	if (err)
 		goto out_budg;
 
@@ -1610,14 +1620,6 @@ int ubifs_getattr(const struct path *path, struct kstat *stat,
 	return 0;
 }
 
-static int ubifs_dir_open(struct inode *dir, struct file *file)
-{
-	if (IS_ENCRYPTED(dir))
-		return fscrypt_get_encryption_info(dir) ? -EACCES : 0;
-
-	return 0;
-}
-
 const struct inode_operations ubifs_dir_inode_operations = {
 	.lookup      = ubifs_lookup,
 	.create      = ubifs_create,
@@ -1644,7 +1646,6 @@ const struct file_operations ubifs_dir_operations = {
 	.iterate_shared = ubifs_readdir,
 	.fsync          = ubifs_fsync,
 	.unlocked_ioctl = ubifs_ioctl,
-	.open		= ubifs_dir_open,
#ifdef CONFIG_COMPAT
 	.compat_ioctl   = ubifs_compat_ioctl,
#endif
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 000b457ad087..894cc28142e7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -28,7 +28,7 @@
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 
-int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+int sysctl_unprivileged_userfaultfd __read_mostly;
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -405,6 +405,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
 	if (ctx->features & UFFD_FEATURE_SIGBUS)
 		goto out;
+	if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
+	    ctx->flags & UFFD_USER_MODE_ONLY) {
+		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+			"sysctl knob to 1 if kernel faults must be handled "
+			"without obtaining CAP_SYS_PTRACE capability\n");
+		goto out;
+	}
 
 	/*
 	 * If it's already released don't get it. This avoids to loop
@@ -1959,16 +1966,23 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 	struct userfaultfd_ctx *ctx;
 	int fd;
 
-	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+	if (!sysctl_unprivileged_userfaultfd &&
+	    (flags & UFFD_USER_MODE_ONLY) == 0 &&
+	    !capable(CAP_SYS_PTRACE)) {
+		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
+			"sysctl knob to 1 if kernel faults must be handled "
+			"without obtaining CAP_SYS_PTRACE capability\n");
 		return -EPERM;
+	}
 
 	BUG_ON(!current->mm);
 
 	/* Check the UFFD_* constants for consistency.  */
+	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
 	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
 		return -EINVAL;
 
 	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
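[Note: with unprivileged_userfaultfd now defaulting to 0, an unprivileged process can still create a userfaultfd by passing UFFD_USER_MODE_ONLY, which restricts it to faults taken from user mode. A minimal userspace sketch; it assumes a uapi header new enough to define the flag, and the fallback define mirrors the value this series adds:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

#ifndef UFFD_USER_MODE_ONLY
#define UFFD_USER_MODE_ONLY 1	/* from the updated uapi header */
#endif

int main(void)
{
	/* Unprivileged create: only user-mode faults will be delivered. */
	int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
	if (ufd < 0) {
		perror("userfaultfd");	/* EPERM without the flag or sysctl */
		return 1;
	}

	struct uffdio_api api = { .api = UFFD_API };
	if (ioctl(ufd, UFFDIO_API, &api) < 0) {
		perror("UFFDIO_API");
		return 1;
	}
	printf("userfaultfd ready, features 0x%llx\n",
	       (unsigned long long)api.features);
	close(ufd);
	return 0;
}

End of note.]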
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 5ab3bbec8108..f7e997a01ad0 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/enable.c: ioctl to enable verity on a file
+ * Ioctl to enable verity on a file
  *
  * Copyright 2019 Google LLC
  */
@@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
 	 * Some pages of the file may have been evicted from pagecache after
 	 * being used in the Merkle tree construction, then read into pagecache
 	 * again by another process reading from the file concurrently.  Since
-	 * these pages didn't undergo verification against the file measurement
-	 * which fs-verity now claims to be enforcing, we have to wipe the
-	 * pagecache to ensure that all future reads are verified.
+	 * these pages didn't undergo verification against the file digest which
+	 * fs-verity now claims to be enforcing, we have to wipe the pagecache
+	 * to ensure that all future reads are verified.
 	 */
 	filemap_write_and_wait(inode->i_mapping);
 	invalidate_inode_pages2(inode->i_mapping);
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index e96d99d5145e..6413d28664d6 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -14,7 +14,7 @@
 
 #define pr_fmt(fmt) "fs-verity: " fmt
 
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
 #include <linux/fsverity.h>
 #include <linux/mempool.h>
 
@@ -67,52 +67,22 @@ struct merkle_tree_params {
  * When a verity file is first opened, an instance of this struct is allocated
 * and stored in ->i_verity_info; it remains until the inode is evicted.  It
 * caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file.  It also caches the file measurement.  The Merkle
- * tree pages themselves are not cached here, but the filesystem may cache them.
+ * data read from the file.  It also caches the file digest.  The Merkle tree
+ * pages themselves are not cached here, but the filesystem may cache them.
 */
 struct fsverity_info {
 	struct merkle_tree_params tree_params;
 	u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE];
-	u8 measurement[FS_VERITY_MAX_DIGEST_SIZE];
+	u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
 	const struct inode *inode;
 };
 
-/*
- * Merkle tree properties.  The file measurement is the hash of this structure
- * excluding the signature and with the sig_size field set to 0.
- */
-struct fsverity_descriptor {
-	__u8 version;		/* must be 1 */
-	__u8 hash_algorithm;	/* Merkle tree hash algorithm */
-	__u8 log_blocksize;	/* log2 of size of data and tree blocks */
-	__u8 salt_size;		/* size of salt in bytes; 0 if none */
-	__le32 sig_size;	/* size of signature in bytes; 0 if none */
-	__le64 data_size;	/* size of file the Merkle tree is built over */
-	__u8 root_hash[64];	/* Merkle tree root hash */
-	__u8 salt[32];		/* salt prepended to each hashed block */
-	__u8 __reserved[144];	/* must be 0's */
-	__u8 signature[];	/* optional PKCS#7 signature */
-};
-
 /* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
 
#define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
					 sizeof(struct fsverity_descriptor))
 
-/*
- * Format in which verity file measurements are signed.  This is the same as
- * 'struct fsverity_digest', except here some magic bytes are prepended to
- * provide some context about what is being signed in case the same key is used
- * for non-fsverity purposes, and here the fields have fixed endianness.
- */
-struct fsverity_signed_digest {
-	char magic[8];			/* must be "FSVerity" */
-	__le16 digest_algorithm;
-	__le16 digest_size;
-	__u8 digest[];
-};
-
 /* hash_algs.c */
 
 extern struct fsverity_hash_alg fsverity_hash_algs[];
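[Note: enable.c implements FS_IOC_ENABLE_VERITY, whose pagecache-wipe comment is reworded above. A hedged userspace sketch of turning verity on; the path is a placeholder and the filesystem must support fs-verity:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>	/* FS_IOC_ENABLE_VERITY, fsverity_enable_arg */

int main(void)
{
	int fd = open("/mnt/data/file", O_RDONLY);
	if (fd < 0)
		return 1;

	struct fsverity_enable_arg arg = {
		.version	= 1,
		.hash_algorithm	= FS_VERITY_HASH_ALG_SHA256,
		.block_size	= 4096,
		.salt_size	= 0,
	};

	/* On success the file becomes permanently read-only and every read
	 * is verified against the Merkle tree rooted in the file digest. */
	if (ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) < 0)
		perror("FS_IOC_ENABLE_VERITY");
	close(fd);
	return 0;
}

End of note.]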
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index c37e186ebeb6..71d0fccb6d4c 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/hash_algs.c: fs-verity hash algorithms
+ * fs-verity hash algorithms
  *
  * Copyright 2019 Google LLC
 */
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 94c104e00861..c98b7016f446 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/init.c: fs-verity module initialization and logging
+ * fs-verity module initialization and logging
  *
  * Copyright 2019 Google LLC
 */
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index df409a5682ed..f0d7b30c62db 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/measure.c: ioctl to get a verity file's measurement
+ * Ioctl to get a verity file's digest
  *
  * Copyright 2019 Google LLC
 */
@@ -10,12 +10,12 @@
 #include <linux/uaccess.h>
 
 /**
- * fsverity_ioctl_measure() - get a verity file's measurement
- * @filp: file to get measurement of
+ * fsverity_ioctl_measure() - get a verity file's digest
+ * @filp: file to get digest of
  * @_uarg: user pointer to fsverity_digest
  *
- * Retrieve the file measurement that the kernel is enforcing for reads from a
- * verity file.  See the "FS_IOC_MEASURE_VERITY" section of
+ * Retrieve the file digest that the kernel is enforcing for reads from a verity
+ * file.  See the "FS_IOC_MEASURE_VERITY" section of
  * Documentation/filesystems/fsverity.rst for the documentation.
  *
  * Return: 0 on success, -errno on failure
@@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg)
 	if (copy_to_user(uarg, &arg, sizeof(arg)))
 		return -EFAULT;
 
-	if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size))
+	if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size))
 		return -EFAULT;
 
 	return 0;
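[Note: the counterpart from userspace is FS_IOC_MEASURE_VERITY, which returns the digest measure.c now calls file_digest. A sketch; the path is a placeholder for a file with verity already enabled, and the 64-byte buffer constant is our own (large enough for SHA-512):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fsverity.h>	/* FS_IOC_MEASURE_VERITY, fsverity_digest */

#define MAX_DIGEST_SIZE 64	/* our assumption: covers SHA-512 */

int main(void)
{
	int fd = open("/mnt/data/file", O_RDONLY);
	if (fd < 0)
		return 1;

	/* digest[] is a flexible array: allocate room behind the header. */
	struct fsverity_digest *d = malloc(sizeof(*d) + MAX_DIGEST_SIZE);
	if (!d)
		return 1;
	d->digest_size = MAX_DIGEST_SIZE;	/* in: buffer size; out: actual */

	if (ioctl(fd, FS_IOC_MEASURE_VERITY, d) < 0) {
		perror("FS_IOC_MEASURE_VERITY");
		return 1;
	}
	printf("alg %u, %u-byte digest: ", d->digest_algorithm, d->digest_size);
	for (int i = 0; i < d->digest_size; i++)
		printf("%02x", d->digest[i]);
	printf("\n");
	return 0;
}

End of note.]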
diff --git a/fs/verity/open.c b/fs/verity/open.c
index bfe0280c14e4..228d0eca3e2e 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/open.c: opening fs-verity files
+ * Opening fs-verity files
  *
  * Copyright 2019 Google LLC
 */
@@ -124,18 +124,18 @@ out_err:
 }
 
 /*
- * Compute the file measurement by hashing the fsverity_descriptor excluding the
+ * Compute the file digest by hashing the fsverity_descriptor excluding the
 * signature and with the sig_size field set to 0.
 */
-static int compute_file_measurement(struct fsverity_hash_alg *hash_alg,
-				    struct fsverity_descriptor *desc,
-				    u8 *measurement)
+static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
+			       struct fsverity_descriptor *desc,
+			       u8 *file_digest)
 {
 	__le32 sig_size = desc->sig_size;
 	int err;
 
 	desc->sig_size = 0;
-	err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement);
+	err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
 	desc->sig_size = sig_size;
 
 	return err;
@@ -199,15 +199,15 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 
 	memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
 
-	err = compute_file_measurement(vi->tree_params.hash_alg, desc,
-				       vi->measurement);
+	err = compute_file_digest(vi->tree_params.hash_alg, desc,
+				  vi->file_digest);
 	if (err) {
-		fsverity_err(inode, "Error %d computing file measurement", err);
+		fsverity_err(inode, "Error %d computing file digest", err);
 		goto out;
 	}
-	pr_debug("Computed file measurement: %s:%*phN\n",
+	pr_debug("Computed file digest: %s:%*phN\n",
 		 vi->tree_params.hash_alg->name,
-		 vi->tree_params.digest_size, vi->measurement);
+		 vi->tree_params.digest_size, vi->file_digest);
 
 	err = fsverity_verify_signature(vi, desc, desc_size);
 out:
@@ -354,7 +354,7 @@ int __init fsverity_init_info_cache(void)
 {
 	fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
						   SLAB_RECLAIM_ACCOUNT,
-						   measurement);
+						   file_digest);
 	if (!fsverity_info_cachep)
 		return -ENOMEM;
 	return 0;
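[Note: the signature.c hunk that follows verifies a PKCS#7 signature over struct fsverity_formatted_digest, the renamed fsverity_signed_digest whose definition this series moves out of fsverity_private.h into the uapi header. A hedged userspace sketch of what a signing tool would feed to its PKCS#7 step; the struct mirrors the uapi layout, the helper name is hypothetical, and the plain uint16_t stores assume a little-endian build (the kernel does the cpu_to_le16() conversions explicitly):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct fsverity_formatted_digest {
	char magic[8];		/* must be "FSVerity" */
	uint16_t digest_algorithm;
	uint16_t digest_size;
	uint8_t digest[];
};

/* Build the blob that actually gets signed; caller frees. */
static void *build_formatted_digest(uint16_t alg, const uint8_t *digest,
				    uint16_t size, size_t *out_len)
{
	struct fsverity_formatted_digest *d;

	*out_len = sizeof(*d) + size;
	d = calloc(1, *out_len);
	if (!d)
		return NULL;
	memcpy(d->magic, "FSVerity", 8);
	d->digest_algorithm = alg;	/* e.g. 1 == SHA-256 */
	d->digest_size = size;
	memcpy(d->digest, digest, size);
	return d;
}

End of note.]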
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index b14ed96387ec..012468eda2a7 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/signature.c: verification of builtin signatures
+ * Verification of builtin signatures
  *
  * Copyright 2019 Google LLC
 */
@@ -32,8 +32,8 @@ static struct key *fsverity_keyring;
  * @desc: the file's fsverity_descriptor
  * @desc_size: size of @desc
  *
- * If the file's fs-verity descriptor includes a signature of the file
- * measurement, verify it against the certificates in the fs-verity keyring.
+ * If the file's fs-verity descriptor includes a signature of the file digest,
+ * verify it against the certificates in the fs-verity keyring.
  *
  * Return: 0 on success (signature valid or not required); -errno on failure
  */
@@ -44,7 +44,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
 	const struct inode *inode = vi->inode;
 	const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
 	const u32 sig_size = le32_to_cpu(desc->sig_size);
-	struct fsverity_signed_digest *d;
+	struct fsverity_formatted_digest *d;
 	int err;
 
 	if (sig_size == 0) {
@@ -67,7 +67,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
 	memcpy(d->magic, "FSVerity", 8);
 	d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs);
 	d->digest_size = cpu_to_le16(hash_alg->digest_size);
-	memcpy(d->digest, vi->measurement, hash_alg->digest_size);
+	memcpy(d->digest, vi->file_digest, hash_alg->digest_size);
 
 	err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
 				     desc->signature, sig_size,
@@ -90,8 +90,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
 		return err;
 	}
 
-	pr_debug("Valid signature for file measurement %s:%*phN\n",
-		 hash_alg->name, hash_alg->digest_size, vi->measurement);
+	pr_debug("Valid signature for file digest %s:%*phN\n",
+		 hash_alg->name, hash_alg->digest_size, vi->file_digest);
 	return 0;
 }
 
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a8b68c6f663d..0adb970f4e73 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages()
+ * Data verification functions, i.e. hooks for ->readpages()
  *
  * Copyright 2019 Google LLC
 */
diff --git a/fs/xattr.c b/fs/xattr.c
index cd7a563e8bcd..fd57153b1f61 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
+	const void *orig_value = value;
 	int error;
 
+	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
+		error = cap_convert_nscap(dentry, &value, size);
+		if (error < 0)
+			return error;
+		size = error;
+	}
+
retry_deleg:
 	inode_lock(inode);
 	error = __vfs_setxattr_locked(dentry, name, value, size, flags,
@@ -289,6 +297,9 @@ retry_deleg:
 		if (!error)
 			goto retry_deleg;
 	}
+	if (value != orig_value)
+		kfree(value);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
 		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
 			posix_acl_fix_xattr_from_user(kvalue, size);
-		else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
-			error = cap_convert_nscap(d, &kvalue, size);
-			if (error < 0)
-				goto out;
-			size = error;
-		}
 	}
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ef1d5bb88b93..b7c5783a031c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -433,13 +433,10 @@ xfs_fs_goingdown(
 {
 	switch (inflags) {
 	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
-		struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
-
-		if (sb && !IS_ERR(sb)) {
+		if (!freeze_bdev(mp->m_super->s_bdev)) {
 			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
-			thaw_bdev(sb->s_bdev, sb);
+			thaw_bdev(mp->m_super->s_bdev);
 		}
-		break;
 	}
 	case XFS_FSOP_GOING_FLAGS_LOGFLUSH: