diff options
Diffstat (limited to 'fs')
178 files changed, 8310 insertions, 3240 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index aa4c12282301..462253ae483a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,7 +203,7 @@ config TMPFS_XATTR config TMPFS_INODE64 bool "Use 64-bit ino_t by default in tmpfs" - depends on TMPFS && 64BIT + depends on TMPFS && 64BIT && !(S390 || ALPHA) default n help tmpfs has historically used only inode numbers as wide as an unsigned @@ -333,6 +333,10 @@ config NFS_COMMON depends on NFSD || NFS_FS || LOCKD default y +config NFS_V4_2_SSC_HELPER + tristate + default y if NFS_V4=y || NFS_FS=y + source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 885da6d983b4..c6f1c8c1934e 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -29,7 +29,7 @@ config BINFMT_ELF latest version). config COMPAT_BINFMT_ELF - bool + def_bool y depends on COMPAT && BINFMT_ELF select ELFCORE @@ -45,7 +45,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || (SUPERH && !MMU) || C6X) + depends on (ARM || (SUPERH && !MMU)) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 41c5749f4db7..5400a876d73f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -460,8 +460,10 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry, return -EIO; bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino); - if (!bh_new) + if (!bh_new) { + affs_brelse(bh_old); return -EIO; + } /* Remove old header from its parent directory. */ affs_lock_dir(old_dir); diff --git a/fs/afs/main.c b/fs/afs/main.c index accdd8970e7c..b2975256dadb 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -193,7 +193,7 @@ static int __init afs_init(void) goto error_cache; #endif - ret = register_pernet_subsys(&afs_net_ops); + ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; @@ -213,7 +213,7 @@ static int __init afs_init(void) error_proc: afs_fs_exit(); error_fs: - unregister_pernet_subsys(&afs_net_ops); + unregister_pernet_device(&afs_net_ops); error_net: #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); @@ -244,7 +244,7 @@ static void __exit afs_exit(void) proc_remove(afs_proc_symlink); afs_fs_exit(); - unregister_pernet_subsys(&afs_net_ops); + unregister_pernet_device(&afs_net_ops); #ifdef CONFIG_AFS_FSCACHE fscache_unregister_netfs(&afs_cache_netfs); #endif diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 950bc177238a..4c1550b13899 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1495,7 +1495,7 @@ static void fill_note(struct memelfnote *note, const char *name, int type, * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ -static void fill_prstatus(struct elf_prstatus *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1717,11 +1717,11 @@ static void do_thread_regset_writeback(struct task_struct *task, } #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S, R) sizeof(S) +#define PRSTATUS_SIZE sizeof(struct elf_prstatus) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S) ((S)->pr_fpvalid = 1) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1729,7 +1729,6 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - int regset0_size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1737,14 +1736,12 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * than being the whole note contents. We fill the reset in here. * We assume that regset 0 is NT_PRSTATUS. */ - fill_prstatus(&t->prstatus, t->task, signr); - regset0_size = regset_get(t->task, &view->regsets[0], + fill_prstatus(&t->prstatus.common, t->task, signr); + regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - if (regset0_size < 0) - return 0; fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); + PRSTATUS_SIZE, &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1772,7 +1769,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, continue; if (is_fpreg) - SET_PR_FPVALID(&t->prstatus, 1, regset0_size); + SET_PR_FPVALID(&t->prstatus); fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX", note_type, ret, data); @@ -1961,7 +1958,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) struct task_struct *p = t->thread; t->num_notes = 0; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), @@ -2040,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, siginfo->si_signo); + fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index be4062b8ba75..03d81a14bcbf 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1191,18 +1191,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf_prstatus_fdpic { - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned long pr_sigpend; /* Set of pending signals */ - unsigned long pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct __kernel_old_timeval pr_utime; /* User time */ - struct __kernel_old_timeval pr_stime; /* System time */ - struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ - struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ + struct elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ /* When using FDPIC, the loadmap addresses need to be communicated * to GDB in order for GDB to do the necessary relocations. The @@ -1301,7 +1290,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ -static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1332,9 +1321,6 @@ static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, } prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); - - prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; - prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, @@ -1405,7 +1391,9 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ if (!t) return t; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); + t->prstatus.pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + t->prstatus.pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); diff --git a/fs/block_dev.c b/fs/block_dev.c index 3b8963e228a1..235b5042672e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -130,7 +130,15 @@ EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { - bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + unsigned int bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_inode->i_blkbits = blksize_bits(bsize); } int set_blocksize(struct block_device *bdev, int size) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9f1b1a88e317..b634c42115ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,5 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 +# Subset of W=1 warnings +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) +subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +# The following turn off the warnings enabled by -Wextra +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-sign-compare +subdir-ccflags-y += -Wno-type-limits + obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ @@ -11,7 +27,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ + subpage.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 02d7d7b2563b..f47c1528eb9a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1501,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, } /** - * btrfs_check_shared - tell us whether an extent is shared + * Check if an extent is shared or not + * + * @root: root inode belongs to + * @inum: inode number of the inode whose extent we are checking + * @bytenr: logical bytenr of the extent we are checking + * @roots: list of roots this extent is shared among + * @tmp: temporary list used for iteration * * btrfs_check_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -2541,13 +2547,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - btrfs_backref_drop_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } /* * Add the node to leaf node list if no other child block * cached. @@ -2624,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, /* Only reloc backref cache cares about a specific root */ if (cache->is_reloc) { root = find_reloc_root(cache->fs_info, cur->bytenr); - if (WARN_ON(!root)) + if (!root) return -ENOENT; cur->root = root; } else { @@ -3117,7 +3116,7 @@ void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, list_del_init(&lower->list); if (lower == node) node = NULL; - btrfs_backref_free_node(cache, lower); + btrfs_backref_drop_node(cache, lower); } btrfs_backref_cleanup_node(cache, node); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff705cc564a9..17abde7f794c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); cache->nr_nodes--; btrfs_put_root(node->root); kfree(node); @@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer( static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node) { - BUG_ON(!list_empty(&node->upper)); + ASSERT(list_empty(&node->upper)); btrfs_backref_drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); + list_del_init(&node->list); + list_del_init(&node->lower); if (!RB_EMPTY_NODE(&node->rb_node)) rb_erase(&node->rb_node, &tree->rb_root); btrfs_backref_free_node(tree, node); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 52f2198d44c9..5064be59dac5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -15,6 +15,7 @@ #include "delalloc-space.h" #include "discard.h" #include "raid56.h" +#include "zoned.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -673,7 +674,15 @@ static noinline void caching_thread(struct btrfs_work *work) wake_up(&caching_ctl->wait); } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + /* + * If we are in the transaction that populated the free space tree we + * can't actually cache from the free space tree as our commit root and + * real root are the same, so we could change the contents of the blocks + * while caching. Instead do the slow caching in this case, and after + * the transaction has committed we will be safe. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) ret = load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); @@ -716,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; @@ -888,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); + btrfs_clear_treelog_bg(block_group); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -1000,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->length); + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1149,7 +1169,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->used; + cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1180,6 +1200,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (!ret) { sinfo->bytes_readonly += num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } @@ -1254,6 +1280,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + return; + spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; @@ -1273,8 +1306,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - mutex_lock(&fs_info->delete_unused_bgs_mutex); - /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); @@ -1363,9 +1394,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1381,8 +1410,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. */ if (trimming) @@ -1420,11 +1453,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans); next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); return; flip_async: @@ -1553,8 +1586,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /** - * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * Map a physical disk address to a list of logical addresses + * + * @fs_info: the filesystem * @chunk_start: logical address of block group + * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1564,9 +1600,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * Used primarily to exclude those portions of a block group that contain super * block copies. */ -EXPORT_FOR_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1584,6 +1620,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; + chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -1598,14 +1635,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u64 stripe_nr; + u64 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; + if (bdev && map->stripes[i].dev->bdev != bdev) + continue; + stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -1619,7 +1660,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size; + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { @@ -1661,7 +1702,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, + ret = btrfs_rmap_block(fs_info, cache->start, NULL, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -1797,24 +1838,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static void read_block_group_item(struct btrfs_block_group *cache, - struct btrfs_path *path, - const struct btrfs_key *key) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_block_group_item bgi; - int slot = path->slots[0]; - - cache->length = key->offset; - - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); -} - static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_path *path, + struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { @@ -1829,7 +1854,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - read_block_group_item(cache, path, key); + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->flags = btrfs_stack_block_group_flags(bgi); set_free_space_tree_thresholds(cache); @@ -1856,6 +1883,13 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } + ret = btrfs_load_block_group_zone_info(cache, false); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space @@ -1869,12 +1903,20 @@ static int read_one_block_group(struct btrfs_fs_info *info, } /* - * Check for two cases, either we are full, and therefore don't need - * to bother with the caching work since we won't find any space, or we - * are empty, and we can just add all the space in and be done with it. - * This saves us _a_lot_ of time, particularly in the full case. + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. */ - if (cache->length == cache->used) { + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); @@ -1893,7 +1935,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, } trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, &space_info); + cache->used, cache->bytes_super, + cache->zone_unusable, &space_info); cache->space_info = space_info; @@ -1949,7 +1992,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, &space_info); + 0, 0, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -1988,19 +2031,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = read_one_block_group(info, path, &key, need_clear); + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; - btrfs_release_path(path); } btrfs_release_path(path); @@ -2132,6 +2185,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->cached = BTRFS_CACHE_FINISHED; if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; + + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); + return ret; + } + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2173,7 +2233,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); + cache->bytes_super, 0, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2281,8 +2341,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&cache->lock); if (!--cache->ro) { num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - cache->used; + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes back */ + cache->zone_unusable = cache->alloc_offset - cache->used; + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -2556,8 +2623,10 @@ again: if (!path) { path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } } /* @@ -2651,16 +2720,14 @@ again: btrfs_put_block_group(cache); if (drop_reserve) btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be * removed. */ mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); @@ -2669,7 +2736,8 @@ again: * Go through delayed refs for all the stuff we've just kicked off * and then loop back (just once) */ - ret = btrfs_run_delayed_refs(trans, 0); + if (!ret) + ret = btrfs_run_delayed_refs(trans, 0); if (!ret && loops == 0) { loops++; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2683,7 +2751,12 @@ again: goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } @@ -2887,10 +2960,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, + num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8f74a96074f7..29678426247d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -95,6 +95,8 @@ struct btrfs_block_group { unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; + unsigned int to_copy:1; + unsigned int relocating_repair:1; int disk_cache_state; @@ -181,8 +183,19 @@ struct btrfs_block_group { */ int needs_free_space; + /* Flag indicating this block group is placed on a sequential zone */ + bool seq_zone; + /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; + + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. + */ + u64 alloc_offset; + u64 zone_unusable; + u64 meta_write_pointer; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -270,6 +283,9 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, struct btrfs_caching_control *caching_ctl); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -296,9 +312,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); -#endif - #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9bf53d9ff90..28e202e89660 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -325,7 +325,8 @@ struct btrfs_dio_private { struct inode *inode; u64 logical_offset; u64 disk_bytenr; - u64 bytes; + /* Used for bio::bi_size */ + u32 bytes; /* * References to this structure. There is one reference per in-flight diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5ae3fa0386b7..6d203acfdeb3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -542,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make * sure they map to this compressed extent on disk. */ - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + + end = last_offset + PAGE_SIZE - 1; lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cc89b63d65a4..d56730a67885 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -221,9 +221,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - - if (ret) + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); return ret; + } btrfs_mark_buffer_dirty(cow); *cow_ret = cow; @@ -1494,6 +1497,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * helper function for defrag to decide if two blocks pointed to by a @@ -2821,6 +2825,7 @@ done: btrfs_release_path(p); return ret; } +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); /* * Like btrfs_search_slot, this looks for a key in the given tree. It uses the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e6e37591f1de..3bc00aed13b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -298,7 +298,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34) + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -563,6 +564,9 @@ enum { /* Indicate that we need to cleanup space cache v1 */ BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, }; /* @@ -794,7 +798,7 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; - struct percpu_counter dio_bytes; + struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; @@ -930,6 +934,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -971,6 +976,9 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; @@ -1101,7 +1109,7 @@ struct btrfs_root { u32 type; - u64 highest_objectid; + u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -2737,6 +2745,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, + FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -3097,15 +3106,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid); + struct btrfs_root *parent_root); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3116,6 +3124,8 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bacee09b7bfd..56642ca7af10 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, } /** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. + * Release any excessive reservation + * + * @inode: the inode we need to release from + * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup + * meta reservation needs to know if we are freeing qgroup + * reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal + * release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. @@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * Release a metadata reservation for an inode + * * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. * @qgroup_free: free qgroup reservation or convert it to per-trans reservation @@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, } /** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use + * Release data and metadata space for delalloc + * + * @inode: inode we're releasing space for + * @reserved: list of changed/reserved ranges + * @start: start position of the space already reserved + * @len: length of the space already reserved + * @qgroup_free: should qgroup reserved-space also be freed * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 70c0340d839c..ec0b50b8c5d6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node && (!count || (count && nr--))) { + while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 353cc2994d10..63be7d01a9a3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) } /** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. + * Release a ref head's reservation + * + * @fs_info: the filesystem + * @nr: number of items to drop * * This drops the delayed ref head's count from the delayed refs rsv and frees * any excess reservation we had. @@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) } /** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. + * Transfer bytes to our delayed refs rsv + * + * @fs_info: the filesystem + * @src: source block rsv to transfer from + * @num_bytes: number of bytes to transfer * * This transfers up to the num_bytes amount from the src rsv to the * delayed_refs_rsv. Any extra bytes are returned to the space info. @@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, } /** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. + * Refill based on our delayed refs usage + * + * @fs_info: the filesystem + * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. @@ -648,12 +651,12 @@ inserted: */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, - struct btrfs_delayed_ref_head *update, - int *old_ref_mod_ret) + struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -701,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; - if (old_ref_mod_ret) - *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; @@ -724,6 +725,27 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates += csum_leaves; } } + + /* + * This handles the following conditions: + * + * 1. We had a ref mod of 0 or more and went negative, indicating that + * we may be freeing space, so add our space to the + * total_bytes_pinned counter. + * 2. We were negative and went to 0 or positive, so no longer can say + * that the space would be pinned, decrement our counter from the + * total_bytes_pinned counter. + * 3. We are now at 0 and have ->must_insert_reserved set, which means + * this was a new allocation and then we dropped it, and thus must + * add our space to the total_bytes_pinned counter. + */ + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + spin_unlock(&existing->lock); } @@ -798,8 +820,7 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) + int action, int *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -821,8 +842,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(trans, existing, head_ref, - old_ref_mod); + update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -830,14 +850,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - if (old_ref_mod) - *old_ref_mod = 0; + u64 flags = btrfs_ref_head_to_space_flags(head_ref); + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } + if (head_ref->ref_mod < 0) + btrfs_mod_total_bytes_pinned(trans->fs_info, flags, + head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -845,8 +868,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; - if (new_ref_mod) - *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -909,8 +930,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; @@ -977,8 +997,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1006,8 +1025,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod) + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -1073,8 +1091,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1117,7 +1134,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL, NULL, NULL); + NULL); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 1c977e6d45dc..e22fba272e4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref { u64 offset; }; +enum btrfs_delayed_ref_flags { + /* Indicate that we are flushing delayed refs for the commit */ + BTRFS_DELAYED_REFS_FLUSHING, +}; + struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root { u64 pending_csums; - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; + unsigned long flags; u64 run_delayed_start; @@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } +static inline u64 btrfs_ref_head_to_space_flags( + struct btrfs_delayed_ref_head *head_ref) +{ + if (head_ref->is_data) + return BTRFS_BLOCK_GROUP_DATA; + else if (head_ref->is_system) + return BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_METADATA; +} + static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) { if (refcount_dec_and_test(&head->refs)) @@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod); + u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 324f646d6e5e..3a9c1e046ebe 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" +#include "block-group.h" /* * Device replace overview @@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, + struct btrfs_device *src_dev) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_block_group *cache; + struct btrfs_trans_handle *trans; + int ret = 0; + u64 chunk_offset; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&fs_info->chunk_mutex); + + /* Ensure we don't have pending new block group */ + spin_lock(&fs_info->trans_lock); + while (fs_info->running_transaction && + !list_empty(&fs_info->running_transaction->dev_update_list)) { + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->chunk_mutex); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret == -ENOENT) { + spin_lock(&fs_info->trans_lock); + continue; + } else { + goto unlock; + } + } + + ret = btrfs_commit_transaction(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret) + goto unlock; + + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto unlock; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = src_dev->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto free_path; + if (ret > 0) { + ret = 0; + goto free_path; + } + } else { + ret = 0; + } + } + + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != src_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) + goto skip; + + spin_lock(&cache->lock); + cache->to_copy = 1; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + +skip: + ret = btrfs_next_item(root, path); + if (ret != 0) { + if (ret > 0) + ret = 0; + break; + } + } + +free_path: + btrfs_free_path(path); +unlock: + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 chunk_offset = cache->start; + int num_extents, cur_extent; + int i; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return true; + + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + return true; + } + spin_unlock(&cache->lock); + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(em)); + map = em->map_lookup; + + num_extents = cur_extent = 0; + for (i = 0; i < map->num_stripes; i++) { + /* We have more device extent to copy */ + if (srcdev != map->stripes[i].dev) + continue; + + num_extents++; + if (physical == map->stripes[i].physical) + cur_extent = i; + } + + free_extent_map(em); + + if (num_extents > 1 && cur_extent < num_extents - 1) { + /* + * Has more stripes on this device. Keep this block group + * readonly until we finish all the stripes. + */ + return false; + } + + /* Last stripe on this device */ + spin_lock(&cache->lock); + cache->to_copy = 0; + spin_unlock(&cache->lock); + + return true; +} + static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + ret = mark_block_group_to_copy(fs_info, src_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -715,7 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 60b70dacc299..3911049a5f23 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical); #endif diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 2b8383d41144..306ff20af70f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group( } /** - * peek_discard_list - wrap find_next_block_group() - * @discard_ctl: discard control + * Wrap find_next_block_group() + * + * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management + * @now: time when discard was invoked, in ns * * This wraps find_next_block_group() and sets the block_group to be in use. * discard_state's control flow is managed here. Variables related to diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b35b7e88136..41b718cfea40 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return 0; found_start = btrfs_header_bytenr(eb); + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON(found_start != 0); + return 0; + } + /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong. @@ -591,6 +597,59 @@ out: return ret; } +static int validate_subpage_buffer(struct page *page, u64 start, u64 end, + int mirror) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct extent_buffer *eb; + bool reads_done; + int ret = 0; + + /* + * We don't allow bio merge for subpage metadata read, so we should + * only get one eb for each endio hook. + */ + ASSERT(end == start + fs_info->nodesize - 1); + ASSERT(PagePrivate(page)); + + eb = find_extent_buffer(fs_info, start); + /* + * When we are reading one tree block, eb must have been inserted into + * the radix tree. If not, something is wrong. + */ + ASSERT(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + /* Subpage read must finish in page read */ + ASSERT(reads_done); + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); + if (ret < 0) + goto err; + + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(eb, ret); + + set_extent_buffer_uptodate(eb); + + free_extent_buffer(eb); + return ret; +err: + /* + * end_bio_extent_readpage decrements io_pages in case of error, + * make sure it has something to decrement. + */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); + return ret; +} + int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror) @@ -600,6 +659,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, int reads_done; ASSERT(page->private); + + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return validate_subpage_buffer(page, start, end, mirror); + eb = (struct extent_buffer *)page->private; /* @@ -646,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) @@ -808,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, static int check_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { + if (btrfs_is_zoned(fs_info)) + return 0; if (atomic_read(&bi->sync_writers)) return 0; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) @@ -822,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -1016,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->orphan_cleanup_state = 0; root->last_trans = 0; - root->highest_objectid = 0; + root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; @@ -1189,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) @@ -1199,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + /* * DON'T set SHAREABLE bit for log trees. * @@ -1211,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); - if (IS_ERR(leaf)) { - btrfs_put_root(root); - return ERR_CAST(leaf); - } + if (IS_ERR(leaf)) + return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - return root; + + return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -1231,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } + } + WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; @@ -1242,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; @@ -1367,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } mutex_lock(&root->objectid_mutex); - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; } - ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); @@ -1470,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -2427,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info, btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { + + /* + * For 4K page size, we only support 4K sector size. + * For 64K page size, we support read-write for 64K sector size, and + * read-only for 4K sector size. + */ + if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || + (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && + sectorsize != SZ_64K))) { btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", + "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); @@ -2646,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); + ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; } - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -2695,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2804,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -3044,6 +3140,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + fs_info->csum_size = btrfs_super_csum_size(disk_super); + ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { err = ret; @@ -3138,8 +3236,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); - fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); - /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -3161,7 +3257,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->csum_size = btrfs_super_csum_size(disk_super); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; @@ -3193,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + /* For 4K sector size support, it's only read-only */ + if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { + if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + btrfs_err(fs_info, + "subpage sectorsize %u only supported read-only for page size %lu", + sectorsize, PAGE_SIZE); + err = -EINVAL; + goto fail_alloc; + } + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; @@ -3260,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_tree_roots; /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. + */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", + ret); + goto fail_block_groups; + } + + /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the @@ -4113,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4165,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } - if (percpu_counter_sum(&fs_info->dio_bytes)) + if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", - percpu_counter_sum(&fs_info->dio_bytes)); + percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -4688,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_redirty_list(cur_trans); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4745,7 +4867,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_init_root_free_objectid(struct btrfs_root *root) { struct btrfs_path *path; int ret; @@ -4769,10 +4891,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } ret = 0; error: @@ -4780,12 +4902,12 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", root->root_key.objectid); @@ -4793,7 +4915,7 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->highest_objectid; + *objectid = root->free_objectid++; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e45057c0c016..0e7e9526b6a8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,6 +120,8 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, @@ -133,8 +135,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d79b8369e6aa..78ad31a59e59 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,8 @@ #include "block-group.h" #include "discard.h" #include "rcu-string.h" +#include "zoned.h" +#include "dev-replace.h" #undef SCRAMBLE_DELAYED_REFS @@ -82,41 +84,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) EXTENT_UPTODATE); } -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; - } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -1299,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } +static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { @@ -1333,20 +1340,13 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; - struct request_queue *req_q; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) - continue; - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { @@ -1388,7 +1388,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && @@ -1397,17 +1396,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) - ret = btrfs_add_delayed_tree_ref(trans, generic_ref, - NULL, &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else - ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - sub_pinned_bytes(fs_info, generic_ref); - return ret; } @@ -1795,34 +1789,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, { int nr_items = 1; /* Dropping this ref head update. */ - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + } - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + /* + * We were dropping refs, or had a new ref and dropped it, and thus must + * adjust down our total_bytes_pinned, the space may or may not have + * been pinned and so is accounted for properly in the pinned space by + * now. + */ + if (head->total_ref_mod < 0 || + (head->total_ref_mod == 0 && head->must_insert_reserved)) { + u64 flags = btrfs_ref_head_to_space_flags(head); - /* - * We had csum deletions accounted for in our delayed refs rsv, - * we need to drop the csum leaves for this update from our - * delayed_refs_rsv. - */ - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, - head->num_bytes); - } + btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); } btrfs_delayed_refs_rsv_release(fs_info, nr_items); @@ -2160,7 +2148,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (count == 0) - count = atomic_read(&delayed_refs->num_entries) * 2; + count = delayed_refs->num_heads_ready; again: #ifdef SCRAMBLE_DELAYED_REFS @@ -2572,8 +2560,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2602,8 +2589,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; int ret; - btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2615,11 +2600,19 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * the pinned extents. */ btrfs_cache_block_group(cache, 1); + /* + * Make sure we wait until the cache is completely built in case it is + * missing or is invalid and therefore needs to be rebuilt. + */ + ret = btrfs_wait_block_group_cache_done(cache); + if (ret) + goto out; pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); +out: btrfs_put_block_group(cache); return ret; } @@ -2629,50 +2622,22 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, { int ret; struct btrfs_block_group *block_group; - struct btrfs_caching_control *caching_ctl; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 0); - caching_ctl = btrfs_get_caching_control(block_group); - - if (!caching_ctl) { - /* Logic error */ - BUG_ON(!btrfs_block_group_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - } else { - /* - * We must wait for v1 caching to finish, otherwise we may not - * remove our space. - */ - btrfs_wait_space_cache_v1_finished(block_group, caching_ctl); - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; + btrfs_cache_block_group(block_group, 1); + /* + * Make sure we wait until the cache is completely built in case it is + * missing or is invalid and therefore needs to be rebuilt. + */ + ret = btrfs_wait_block_group_cache_done(block_group); + if (ret) + goto out; - num_bytes = (start + num_bytes) - - caching_ctl->progress; - start = caching_ctl->progress; - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - btrfs_put_caching_control(caching_ctl); - } + ret = btrfs_remove_free_space(block_group, start, num_bytes); +out: btrfs_put_block_group(block_group); return ret; } @@ -2806,11 +2771,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space && @@ -2863,9 +2831,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - clear_extent_bits(&fs_info->excluded_extents, start, - end, EXTENT_UPTODATE); if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, @@ -3343,7 +3308,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ref generic_ref = { 0 }; - int pin = 1; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3352,13 +3316,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, root->root_key.objectid); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { @@ -3366,11 +3326,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); goto out; + } } - pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -3379,6 +3340,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + if (btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); @@ -3387,9 +3355,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); - if (last_ref) { /* * Deleting the buffer, clear the corrupt flag since it doesn't @@ -3403,7 +3368,6 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) @@ -3419,14 +3383,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (!((ref->type == BTRFS_REF_METADATA && @@ -3435,9 +3396,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); - return ret; } @@ -3514,6 +3472,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache, enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, }; /* @@ -3538,6 +3497,9 @@ struct find_free_extent_ctl { bool have_caching_bg; bool orig_have_caching_bg; + /* Allocation is called for tree-log */ + bool for_treelog; + /* RAID index, converted from flags */ int index; @@ -3766,6 +3728,118 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. + * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; + int ret = 0; + bool skip; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. + */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); + + if (block_group->ro) { + ret = 1; + goto out; + } + + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + avail = block_group->length - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. + */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) @@ -3773,6 +3847,8 @@ static int do_allocation(struct btrfs_block_group *block_group, switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } @@ -3787,6 +3863,9 @@ static void release_block_group(struct btrfs_block_group *block_group, ffe_ctl->retry_clustered = false; ffe_ctl->retry_unclustered = false; break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3815,6 +3894,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3830,6 +3912,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) */ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Give up here */ + return -ENOSPC; default: BUG(); } @@ -3998,6 +4083,14 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } + return 0; default: BUG(); } @@ -4040,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root, struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); WARN_ON(num_bytes < fs_info->sectorsize); @@ -4053,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.for_treelog = for_treelog; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; /* For clustered allocation */ @@ -4061,6 +4156,9 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.last_ptr = NULL; ffe_ctl.use_cluster = true; + if (btrfs_is_zoned(fs_info)) + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -4124,8 +4222,11 @@ search: struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) + if (unlikely(block_group->ro)) { + if (for_treelog) + btrfs_clear_treelog_bg(block_group); continue; + } btrfs_grab_block_group(block_group, delalloc); ffe_ctl.search_start = block_group->start; @@ -4203,20 +4304,21 @@ have_block_group: /* move on to the next group */ if (ffe_ctl.search_start + num_bytes > block_group->start + block_group->length) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); @@ -4310,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); flags = get_alloc_profile_by_root(root, is_data); again: @@ -4333,8 +4436,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); + "allocation failed flags %llu, wanted %llu tree-log %d", + flags, num_bytes, for_treelog); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4516,7 +4619,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); @@ -4553,7 +4655,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; - int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -4561,9 +4662,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ins->objectid, ins->offset, 0); btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + + return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* @@ -4645,6 +4745,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); set_extent_buffer_uptodate(buf); @@ -4755,8 +4856,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, generic_ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&generic_ref, level, root_objectid); btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) goto out_free_delayed; } @@ -5549,7 +5649,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out_free; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction + * start. See wait_reserve_ticket and the whole + * reservation callchain. + */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9cee458e001..4dfb3ead1175 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -24,6 +24,9 @@ #include "rcu-string.h" #include "backref.h" #include "disk-io.h" +#include "subpage.h" +#include "zoned.h" +#include "block-group.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -389,16 +392,16 @@ do_insert: } /** - * __etree_search - searche @tree for an entry that contains @offset. Such - * entry would have entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. * - * @tree - the tree to search - * @offset - offset that should fall within an entry in @tree - * @next_ret - pointer to the first entry whose range ends after @offset - * @prev - pointer to the first entry whose range begins before @offset - * @p_ret - pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret - points to entry which would have been the parent of the entry, + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * @p_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * * This function returns a pointer to the entry that contains @offset byte @@ -1588,12 +1591,13 @@ out: } /** - * find_contiguous_extent_bit: find a contiguous area of bits - * @tree - io tree to check - * @start - offset to start the search from - * @start_ret - the first offset we found with the bits set - * @end_ret - the final contiguous range of the bits that were set - * @bits - bits to look for + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for * * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges * to set bits appropriately, and then merge them again. During this time it @@ -1625,14 +1629,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, } /** - * find_first_clear_extent_bit - find the first range that has @bits not set. - * This range could start before @start. + * Find the first range that has @bits not set. This range could start before + * @start. * - * @tree - the tree to search - * @start - the offset at/after which the found extent should start - * @start_ret - records the beginning of the range - * @end_ret - records the end of the range (inclusive) - * @bits - the set of bits which must be unset + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset * * Since unallocated range is also considered one which doesn't have the bits * set it's possible that @end_ret contains -1, this happens in case the range @@ -1975,10 +1979,10 @@ static int __process_pages_contig(struct address_space *mapping, pages_processed++; continue; } - if (page_ops & PAGE_CLEAR_DIRTY) + if (page_ops & PAGE_START_WRITEBACK) { clear_page_dirty_for_io(pages[i]); - if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + } if (page_ops & PAGE_SET_ERROR) SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) @@ -2256,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + if (btrfs_is_zoned(fs_info)) + return btrfs_repair_one_zone(fs_info, logical); + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; @@ -2732,6 +2739,7 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; struct bvec_iter_all iter_all; + bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -2758,6 +2766,11 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + if (first_bvec) { + btrfs_record_physical_zoned(inode, start, bio); + first_bvec = false; + } + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2775,7 +2788,7 @@ struct processed_extent { struct btrfs_inode *inode; /* Start of the range in @inode */ u64 start; - /* End of the range in in @inode */ + /* End of the range in @inode */ u64 end; bool uptodate; }; @@ -2838,15 +2851,38 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate) +static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { + ASSERT(PageLocked(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page)); + btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + if (uptodate) { - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, start, len); } else { - ClearPageUptodate(page); - SetPageError(page); + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); } - unlock_page(page); + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + else if (is_data_inode(page->mapping->host)) + /* + * For subpage data, unlock the page if we're the last reader. + * For subpage metadata, page lock is not utilized for read. + */ + btrfs_subpage_end_reader(fs_info, page, start, len); } /* @@ -2983,7 +3019,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate); + end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } @@ -3058,14 +3094,67 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } +/** + * Attempt to add a page to bio + * + * @bio: destination bio + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @pg_offset: starting offset in the page + * @size: portion of page that we want to write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them + * @return: true if page was added, false otherwise + * + * Attempt to add a page to bio considering stripe alignment etc. + * + * Return true if successfully page added. Otherwise, return false. + */ +static bool btrfs_bio_add_page(struct bio *bio, struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig; + int ret; + + if (prev_bio_flags != bio_flags) + return false; + + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + contig = bio->bi_iter.bi_sector == sector; + else + contig = bio_end_sector(bio) == sector; + if (!contig) + return false; + + if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + return false; + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *first_page = bio_first_bvec_all(bio)->bv_page; + + if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) + return false; + ret = bio_add_zone_append_page(bio, page, size, pg_offset); + } else { + ret = bio_add_page(bio, page, size, pg_offset); + } + + return ret == size; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting * @page: page to add to the bio + * @disk_bytenr: logical bytenr where the write will be + * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @size: portion of page that we want to write - * @offset: starting offset in the page * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -3074,7 +3163,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, - struct page *page, u64 offset, + struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -3086,27 +3175,17 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = offset >> 9; - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(bio_ret); if (*bio_ret) { - bool contig; - bool can_merge = true; - bio = *bio_ret; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) - contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; - - if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) - can_merge = false; - - if (prev_bio_flags != bio_flags || !contig || !can_merge || - force_bio_submit || - bio_add_page(bio, page, io_size, pg_offset) < io_size) { + if (force_bio_submit || + !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, + pg_offset, prev_bio_flags, bio_flags)) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -3120,7 +3199,7 @@ static int submit_extent_page(unsigned int opf, } } - bio = btrfs_bio_alloc(offset); + bio = btrfs_bio_alloc(disk_bytenr); bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -3129,20 +3208,39 @@ static int submit_extent_page(unsigned int opf, if (wbc) { struct block_device *bdev; - bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, page, io_size); } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + btrfs_io_bio(bio)->device = map->stripes[0].dev; + + free_extent_map(em); + } *bio_ret = bio; return ret; } -static void attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page) +static int attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page, + struct btrfs_subpage *prealloc) { + struct btrfs_fs_info *fs_info = eb->fs_info; + int ret = 0; + /* * If the page is mapped to btree inode, we should hold the private * lock to prevent race. @@ -3152,16 +3250,62 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (fs_info->sectorsize == PAGE_SIZE) { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else + WARN_ON(page->private != (unsigned long)eb); + return 0; + } + + /* Already mapped, just free prealloc */ + if (PagePrivate(page)) { + btrfs_free_subpage(prealloc); + return 0; + } + + if (prealloc) + /* Has preallocated memory for subpage */ + attach_page_private(page, prealloc); else - WARN_ON(page->private != (unsigned long)eb); + /* Do new allocation to attach subpage */ + ret = btrfs_attach_subpage(fs_info, page, + BTRFS_SUBPAGE_METADATA); + return ret; +} + +int set_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (PagePrivate(page)) + return 0; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return 0; } -void set_page_extent_mapped(struct page *page) +void clear_page_extent_mapped(struct page *page) { + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + if (!PagePrivate(page)) - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_detach_subpage(fs_info, page); + + detach_page_private(page); } static struct extent_map * @@ -3202,6 +3346,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3218,12 +3363,19 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_extent(tree, start, end); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); + goto out; + } if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); unlock_extent(tree, start, end); + unlock_page(page); goto out; } } @@ -3240,9 +3392,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, kunmap_atomic(userpage); } } + begin_page_read(fs_info, page); while (cur <= end) { bool force_bio_submit = false; - u64 offset; + u64 disk_bytenr; if (cur >= last_byte) { char *userpage; @@ -3257,13 +3410,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); break; } extent_offset = cur - em->start; @@ -3280,9 +3434,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) - offset = em->block_start; + disk_bytenr = em->block_start; else - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3346,6 +3500,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3355,6 +3510,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3363,15 +3519,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to date. Error out */ if (block_start == EXTENT_MAP_INLINE) { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, iosize, + page, disk_bytenr, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3381,19 +3537,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, nr++; *bio_flags = this_bio_flag; } else { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); goto out; } cur = cur + iosize; pg_offset += iosize; } out: - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } return ret; } @@ -3513,23 +3664,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, unsigned long nr_written, int *nr_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 block_start; - u64 iosize; struct extent_map *em; - size_t pg_offset = 0; - size_t blocksize; int ret = 0; int nr = 0; + u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, page_end); + ret = btrfs_writepage_cow_fixup(page, start, end); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3544,16 +3693,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ update_nr_written(wbc, nr_written + 1); - end = page_end; - blocksize = inode->vfs_inode.i_sb->s_blocksize; - while (cur <= end) { + u64 disk_bytenr; u64 em_end; - u64 offset; + u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, - page_end, 1); + btrfs_writepage_endio_finish_ordered(page, cur, end, 1); break; } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); @@ -3565,13 +3711,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, extent_offset = cur - em->start; em_end = extent_map_end(em); - BUG_ON(em_end <= cur); - BUG_ON(end < cur); - iosize = min(em_end - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); - offset = em->block_start + extent_offset; + ASSERT(cur <= em_end); + ASSERT(cur < end); + ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + disk_bytenr = em->block_start + extent_offset; + + /* Note that em_end from extent_map_end() is exclusive */ + iosize = min(em_end, end + 1) - cur; + + if (btrfs_use_zone_append(inode, em)) + opf = REQ_OP_ZONE_APPEND; + free_extent_map(em); em = NULL; @@ -3587,7 +3740,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_writepage_endio_finish_ordered(page, cur, cur + iosize - 1, 1); cur += iosize; - pg_offset += iosize; continue; } @@ -3598,9 +3750,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, offset, iosize, pg_offset, - &epd->bio, + ret = submit_extent_page(opf | write_flags, wbc, page, + disk_bytenr, iosize, + cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3609,8 +3761,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_page_writeback(page); } - cur = cur + iosize; - pg_offset += iosize; + cur += iosize; nr++; } *nr_ret = nr; @@ -3663,7 +3814,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + SetPageError(page); + goto done; + } if (!epd->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, @@ -3923,7 +4078,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - u64 offset = eb->start; + u64 disk_bytenr = eb->start; u32 nritems; int i, num_pages; unsigned long start, end; @@ -3956,7 +4111,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, offset, PAGE_SIZE, 0, + p, disk_bytenr, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3969,7 +4124,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_SIZE; + disk_bytenr += PAGE_SIZE; update_nr_written(wbc, 1); unlock_page(p); } @@ -4010,6 +4165,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; + struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -4042,13 +4198,31 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; + if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { + /* + * If for_sync, this hole will be filled with + * trasnsaction commit. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; + free_extent_buffer(eb); + return ret; + } + *eb_context = eb; ret = lock_extent_buffer_for_io(eb, epd); if (ret <= 0) { + btrfs_revert_meta_write_pointer(cache, eb); + if (cache) + btrfs_put_block_group(cache); free_extent_buffer(eb); return ret; } + if (cache) + btrfs_put_block_group(cache); ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4094,6 +4268,7 @@ int btree_write_cache_pages(struct address_space *mapping, tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -4134,7 +4309,7 @@ retry: } if (ret < 0) { end_write_bio(&epd, ret); - return ret; + goto out; } /* * If something went wrong, don't allow any metadata write bio to be @@ -4169,14 +4344,17 @@ retry: ret = -EROFS; end_write_bio(&epd, ret); } +out: + btrfs_zoned_meta_io_unlock(fs_info); return ret; } /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * Walk the list of dirty pages of the given address space and write all of them. + * * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @data: data passed to __extent_writepage function + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @epd: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, @@ -4975,25 +5153,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -/* - * Release all pages attached to the extent buffer. - */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { - int i; - int num_pages; - int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + struct btrfs_subpage *subpage; - BUG_ON(extent_buffer_under_io(eb)); + lockdep_assert_held(&page->mapping->private_lock); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + if (PagePrivate(page)) { + subpage = (struct btrfs_subpage *)page->private; + if (atomic_read(&subpage->eb_refs)) + return true; + } + return false; +} - if (!page) - continue; +static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + /* + * For mapped eb, we're going to change the page private, which should + * be done under the private_lock. + */ + if (mapped) + spin_lock(&page->mapping->private_lock); + + if (!PagePrivate(page)) { if (mapped) - spin_lock(&page->mapping->private_lock); + spin_unlock(&page->mapping->private_lock); + return; + } + + if (fs_info->sectorsize == PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5012,9 +5204,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) */ detach_page_private(page); } - if (mapped) spin_unlock(&page->mapping->private_lock); + return; + } + + /* + * For subpage, we can have dummy eb with page private. In this case, + * we can directly detach the private as such page is only attached to + * one dummy eb, no sharing. + */ + if (!mapped) { + btrfs_detach_subpage(fs_info, page); + return; + } + + btrfs_page_dec_eb_refs(fs_info, page); + + /* + * We can only detach the page private if there are no other ebs in the + * page range. + */ + if (!page_range_has_eb(fs_info, page)) + btrfs_detach_subpage(fs_info, page); + + spin_unlock(&page->mapping->private_lock); +} + +/* Release all pages attached to the extent buffer */ +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +{ + int i; + int num_pages; + + ASSERT(!extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) + continue; + + detach_extent_buffer_page(eb, page); /* One for when we allocated the page */ put_page(page); @@ -5046,6 +5278,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); + INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -5067,21 +5300,32 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) if (new == NULL) return NULL; + /* + * Set UNMAPPED before calling btrfs_release_extent_buffer(), as + * btrfs_release_extent_buffer() have different behavior for + * UNMAPPED subpage extent buffer. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + for (i = 0; i < num_pages; i++) { + int ret; + p = alloc_page(GFP_NOFS); if (!p) { btrfs_release_extent_buffer(new); return NULL; } - attach_extent_buffer_page(new, p); + ret = attach_extent_buffer_page(new, p, NULL); + if (ret < 0) { + put_page(p); + btrfs_release_extent_buffer(new); + return NULL; + } WARN_ON(PageDirty(p)); - SetPageUptodate(p); new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); - set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + set_extent_buffer_uptodate(new); return new; } @@ -5099,9 +5343,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { + int ret; + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; + ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + if (ret < 0) + goto err; } set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); @@ -5109,8 +5358,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (; i > 0; i--) + for (; i > 0; i--) { + detach_extent_buffer_page(eb, eb->pages[i - 1]); __free_page(eb->pages[i - 1]); + } __free_extent_buffer(eb); return NULL; } @@ -5252,6 +5503,38 @@ free_eb: } #endif +static struct extent_buffer *grab_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page) +{ + struct extent_buffer *exists; + + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. So here we always + * return NULL and just continue. + */ + if (fs_info->sectorsize < PAGE_SIZE) + return NULL; + + /* Page not yet attached to an extent buffer */ + if (!PagePrivate(page)) + return NULL; + + /* + * We could have already allocated an eb for this page and attached one + * so lets see if we can get a ref on the existing eb, and if we can we + * know it's good and we can just return that one, else we know we can + * just overwrite page->private. + */ + exists = (struct extent_buffer *)page->private; + if (atomic_inc_not_zero(&exists->refs)) + return exists; + + WARN_ON(PageDirty(page)); + detach_page_private(page); + return NULL; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -5290,36 +5573,58 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { + struct btrfs_subpage *prealloc = NULL; + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); goto free_eb; } - spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. - */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - goto free_eb; - } - exists = NULL; + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock hold. The memory will be + * freed by attach_extent_buffer_page() or freed manually if + * we exit earlier. + * + * Although we have ensured one subpage eb can only have one + * page, but it may change in the future for 16K page size + * support, so we still preallocate the memory in the loop. + */ + ret = btrfs_alloc_subpage(fs_info, &prealloc, + BTRFS_SUBPAGE_METADATA); + if (ret < 0) { + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } - WARN_ON(PageDirty(p)); - detach_page_private(p); + spin_lock(&mapping->private_lock); + exists = grab_extent_buffer(fs_info, p); + if (exists) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + put_page(p); + mark_extent_buffer_accessed(exists, p); + btrfs_free_subpage(prealloc); + goto free_eb; } - attach_extent_buffer_page(eb, p); + /* Should not fail, as we have preallocated the memory */ + ret = attach_extent_buffer_page(eb, p, prealloc); + ASSERT(!ret); + /* + * To inform we have extra eb under allocation, so that + * detach_extent_buffer_page() won't release the page private + * when the eb hasn't yet been inserted into radix tree. + * + * The ref will be decreased when the eb released the page, in + * detach_extent_buffer_page(). + * Thus needs no special handling in error path. + */ + btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); eb->pages[i] = p; if (!PageUptodate(p)) @@ -5525,31 +5830,101 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (page) - ClearPageUptodate(page); + btrfs_page_clear_uptodate(fs_info, page, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + } +} + +static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct extent_io_tree *io_tree; + struct page *page = eb->pages[0]; + struct bio *bio = NULL; + int ret = 0; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + + if (wait == WAIT_NONE) { + ret = try_lock_extent(io_tree, eb->start, + eb->start + eb->len - 1); + if (ret <= 0) + return ret; + } else { + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); + if (ret < 0) + return ret; + } + + ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + return ret; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, 1); + check_buffer_tree_ref(eb); + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, + eb->len, eb->start - page_offset(page), &bio, + end_bio_extent_readpage, mirror_num, 0, 0, + true); + if (ret) { + /* + * In the endio function, if we hit something wrong we will + * increase the io_pages, so here we need to decrease it for + * error path. + */ + atomic_dec(&eb->io_pages); + } + if (bio) { + int tmp; + + tmp = submit_one_bio(bio, mirror_num, 0); + if (tmp < 0) + return tmp; } + if (ret || wait != WAIT_COMPLETE) + return ret; + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + ret = -EIO; + return ret; } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) @@ -5568,10 +5943,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return read_extent_buffer_subpage(eb, wait, mirror_num); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (wait == WAIT_NONE) { + /* + * WAIT_NONE is only utilized by readahead. If we can't + * acquire the lock atomically it means either the eb + * is being read out or under modification. + * Either way the eb will be or has been cached, + * readahead can exit safely. + */ if (!trylock_page(page)) goto unlock_exit; } else { @@ -5823,6 +6208,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + if (check_eb_range(eb, start, len)) return; @@ -6169,13 +6556,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +static struct extent_buffer *get_next_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *found = NULL; + u64 page_start = page_offset(page); + int ret; + int i; + + ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); + lockdep_assert_held(&fs_info->buffer_lock); + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, + bytenr >> fs_info->sectorsize_bits, + PAGE_SIZE / fs_info->nodesize); + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + break; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + break; + } + } + return found; +} + +static int try_release_subpage_extent_buffer(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + u64 cur = page_offset(page); + const u64 end = page_offset(page) + PAGE_SIZE; + int ret; + + while (cur < end) { + struct extent_buffer *eb = NULL; + + /* + * Unlike try_release_extent_buffer() which uses page->private + * to grab buffer, for subpage case we rely on radix tree, thus + * we need to ensure radix tree consistency. + * + * We also want an atomic snapshot of the radix tree, thus go + * with spinlock rather than RCU. + */ + spin_lock(&fs_info->buffer_lock); + eb = get_next_extent_buffer(fs_info, page, cur); + if (!eb) { + /* No more eb in the page range after or at cur */ + spin_unlock(&fs_info->buffer_lock); + break; + } + cur = eb->start + eb->len; + + /* + * The same as try_release_extent_buffer(), to ensure the eb + * won't disappear out from under us. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&fs_info->buffer_lock); + break; + } + spin_unlock(&fs_info->buffer_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a + * real ref, so just return, this eb will likely be freed soon + * anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + break; + } + + /* + * Here we don't care about the return value, we will always + * check the page private at the end. And + * release_extent_buffer() will release the refs_lock. + */ + release_extent_buffer(eb); + } + /* + * Finally to check if we have cleared page private, as if we have + * released all ebs in the page, the page private should be cleared now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) + ret = 1; + else + ret = 0; + spin_unlock(&page->mapping->private_lock); + return ret; + +} + int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return try_release_subpage_extent_buffer(page); + /* - * We need to make sure nobody is attaching this page to an eb right - * now. + * We need to make sure nobody is changing page->private, as we rely on + * page->private as the pointer to extent buffer. */ spin_lock(&page->mapping->private_lock); if (!PagePrivate(page)) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19221095c635..824640cb0ace 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -31,16 +31,17 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, }; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) -#define PAGE_CLEAR_DIRTY (1 << 1) -#define PAGE_SET_WRITEBACK (1 << 2) -#define PAGE_END_WRITEBACK (1 << 3) -#define PAGE_SET_PRIVATE2 (1 << 4) -#define PAGE_SET_ERROR (1 << 5) -#define PAGE_LOCK (1 << 6) +/* Page starts writeback, clear dirty bit and set writeback bit */ +#define PAGE_START_WRITEBACK (1 << 1) +#define PAGE_END_WRITEBACK (1 << 2) +#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ERROR (1 << 4) +#define PAGE_LOCK (1 << 5) /* * page->private values. Every page that is controlled by the extent @@ -93,6 +94,7 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif @@ -178,7 +180,8 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -void set_page_extent_mapped(struct page *page); +int set_page_extent_mapped(struct page *page); +void clear_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bd6229fb2b6f..4a8e02f7b6c7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } /** - * add_extent_mapping - add new extent map to the extent tree + * Add new extent map to the extent tree + * * @tree: tree to insert new map in * @em: map to insert + * @modified: indicate whether the given @em should be added to the + * modified list, which indicates the extent needs to be logged * * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted @@ -574,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, } /** - * btrfs_add_extent_mapping - add extent mapping into em_tree - * @fs_info - used for tracepoint - * @em_tree - the extent tree into which we want to insert the extent mapping - * @em_in - extent we are inserting - * @start - start of the logical range btrfs_get_extent() is requesting - * @len - length of the logical range btrfs_get_extent() is requesting + * Add extent mapping into em_tree + * + * @fs_info: the filesystem + * @em_tree: extent tree into which we want to insert the extent mapping + * @em_in: extent we are inserting + * @start: start of the logical range btrfs_get_extent() is requesting + * @len: length of the logical range btrfs_get_extent() is requesting * * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6ccfc019ad90..47cd3a6dc635 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -24,8 +24,10 @@ PAGE_SIZE)) /** - * @inode - the inode we want to update the disk_i_size for - * @new_i_size - the i_size we want to set to, 0 if we use i_size + * Set inode's size according to filesystem options + * + * @inode: inode we want to update the disk_i_size for + * @new_i_size: i_size we want to set to, 0 if we use i_size * * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() * returns as it is perfectly fine with a file that has holes without hole file @@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Mark range within a file as having a new extent inserted + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Call when we are inserting a new file extent where there was none before. * Does not need to call this in the case where we're replacing an existing file @@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Marks an inode range as not having a backing extent + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Called when we drop a file extent, for example when we truncate. Doesn't * need to be called for cases where we're replacing a file extent, like when diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e41459b8de6..be9e3900cce8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } /* - * after copy_from_user, pages need to be dirtied and we need to make - * sure holes are created between the current EOF and the start of - * any next extents (if required). - * - * this also makes the decision about creating an inline extent vs - * doing real data extents, marking pages dirty and delalloc as required. + * After btrfs_copy_from_user(), update the following things for delalloc: + * - Mark newly dirtied pages as DELALLOC in the io tree. + * Used to advise which range is to be written back. + * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -1370,6 +1369,12 @@ again: goto fail; } + err = set_page_extent_mapped(pages[i]); + if (err < 0) { + faili = i; + goto fail; + } + if (i == 0) err = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); @@ -1454,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * It's possible the pages are dirty right now, but we don't want - * to clean them yet because copy_from_user may catch a page fault - * and we might have to fall back to one page at a time. If that - * happens, we'll unlock these pages and we'd have a window where - * reclaim could sneak in and drop the once-dirty page on the floor - * without writing it. - * - * We have the pages locked and the extent range locked, so there's - * no way someone can start IO on any dirty pages in this range. - * - * We'll call btrfs_dirty_pages() later on, and that will flip around - * delalloc bits and dirty the pages as required. + * We should be called after prepare_pages() which should have locked + * all pages in the range. */ - for (i = 0; i < num_pages; i++) { - set_page_extent_mapped(pages[i]); + for (i = 0; i < num_pages; i++) WARN_ON(!PageLocked(pages[i])); - } return ret; } @@ -1949,8 +1942,8 @@ relock: goto buffered; } - dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, - &btrfs_dio_ops, is_sync_kiocb(iocb)); + dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + 0); btrfs_inode_unlock(inode, ilock_flags); @@ -1997,9 +1990,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; @@ -2008,7 +1999,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2016,7 +2007,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; if (sync) - atomic_inc(&BTRFS_I(inode)->sync_writers); + atomic_inc(&inode->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) num_written = btrfs_direct_write(iocb, from); @@ -2028,14 +2019,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occurred. */ - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_sub_trans = root->log_transid; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); if (num_written > 0) num_written = generic_write_sync(iocb, num_written); if (sync) - atomic_dec(&BTRFS_I(inode)->sync_writers); + atomic_dec(&inode->sync_writers); current->backing_dev_info = NULL; return num_written; @@ -2177,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. + * + * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the + * logical address recorded in the ordered extent may change. We need + * to wait for the IO to stabilize the logical address. */ - if (full_sync) { + if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); } else { /* @@ -2241,6 +2236,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = PTR_ERR(trans); goto out_release_extents; } + trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); btrfs_release_log_ctx_extents(&ctx); @@ -3622,8 +3618,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4d8897879c9c..5400294bd271 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, int ret; u64 ino; - ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; @@ -431,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) int i; for (i = 0; i < io_ctl->num_pages; i++) { + int ret; + page = find_or_create_page(inode->i_mapping, i, mask); if (!page) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + io_ctl_drop_pages(io_ctl); + return ret; + } + io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -455,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) } } - for (i = 0; i < io_ctl->num_pages; i++) { + for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); - set_page_extent_mapped(io_ctl->pages[i]); - } return 0; } @@ -775,8 +784,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -785,6 +796,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -805,6 +817,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; @@ -1295,11 +1308,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, } /** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle + * Write out cached info to an inode + * + * @root: root the inode belongs to + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds context for the io + * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, @@ -2461,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, int ret = 0; u64 filter_bytes = bytes; + ASSERT(!btrfs_is_zoned(fs_info)); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -2518,11 +2536,49 @@ out: return ret; } +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + ctl->free_space += to_free; + block_group->zone_unusable += to_unusable; + spin_unlock(&ctl->tree_lock); + if (!used) { + spin_lock(&block_group->lock); + block_group->alloc_offset -= size; + spin_unlock(&block_group->lock); + } + + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) + btrfs_mark_bg_unused(block_group); + + return 0; +} + int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2531,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, bytenr, size, trim_state); } +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); +} + /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. But in the case where we add @@ -2541,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2558,6 +2628,23 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. + */ + if (block_group->alloc_offset < offset + bytes) + block_group->alloc_offset = offset + bytes; + return 0; + } + spin_lock(&ctl->tree_lock); again: @@ -2652,6 +2739,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, struct rb_node *n; int count = 0; + /* + * Zoned btrfs does not use free space tree and cluster. Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu", + block_group->length - block_group->alloc_offset); + return; + } + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); @@ -2845,6 +2942,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size); @@ -2976,6 +3075,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct rb_node *node; u64 ret = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -3752,6 +3853,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, int ret; u64 rem = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + *trimmed = 0; spin_lock(&block_group->lock); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ecb09a02d544..1f23088d43f9 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -107,6 +107,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e33a65bd9a0c..a33bca94d133 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1150,6 +1150,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); free_space_root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { @@ -1171,11 +1172,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + ret = btrfs_commit_transaction(trans); - return btrfs_commit_transaction(trans); + /* + * Now that we've committed the transaction any reading of our commit + * root will be safe, so we can cache from the free space tree now. + */ + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8e0a6b038d3..535abf898225 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" struct btrfs_iget_args { u64 ino; @@ -692,8 +693,7 @@ cont: NULL, clear_flags, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); @@ -917,7 +917,6 @@ retry: ins.objectid, async_extent->ram_size, ins.offset, - BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, async_extent->start, @@ -934,8 +933,7 @@ retry: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK); + PAGE_UNLOCK | PAGE_START_WRITEBACK); if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, @@ -971,9 +969,8 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR); + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); goto again; @@ -1071,8 +1068,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK); + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1127,7 +1123,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); + ram_size, cur_alloc_size, + BTRFS_ORDERED_REGULAR); if (ret) goto out_drop_extent_cache; @@ -1194,8 +1191,7 @@ out_reserve: out_unlock: clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * If we reserved an extent for our delalloc range (or a subrange) and * failed to create the respective ordered extent, then it means that @@ -1320,9 +1316,8 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR; + unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); @@ -1399,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode, return 0; } +static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0); + if (ret) + return ret; + + if (*page_started) + return 0; + + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + *page_started = 1; + + return 0; +} + static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { @@ -1519,8 +1537,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1842,8 +1859,7 @@ error: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; @@ -1878,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page { int ret; int force_cow = need_force_cow(inode, start, end); + const bool zoned = btrfs_is_zoned(inode->root->fs_info); if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2183,9 +2206,10 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + struct extent_map *em; u64 length = 0; u64 map_length; - int ret; + int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) @@ -2193,14 +2217,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, - &geom); + em = btrfs_get_chunk_map(fs_info, logical, map_length); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, + map_length, &geom); if (ret < 0) - return ret; + goto out; if (geom.len < length + size) - return 1; - return 0; + ret = 1; +out: + free_extent_map(em); + return ret; } /* @@ -2217,6 +2246,119 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_extent *ordered; + u64 len = bio->bi_iter.bi_size + size; + bool ret = true; + + ASSERT(btrfs_is_zoned(fs_info)); + ASSERT(fs_info->max_zone_append_size > 0); + ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); + + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + if (!ordered) + return ret; + + if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > + ordered->disk_bytenr + ordered->disk_num_bytes) + ret = false; + + btrfs_put_ordered_extent(ordered); + + return ret; +} + +static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) +{ + struct btrfs_ordered_extent *ordered; + struct extent_map *em = NULL, *em_new = NULL; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + + /* No need to split */ + if (ordered->disk_num_bytes == len) + goto out; + + /* We cannot split once end_bio'd ordered extent */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { + ret = -EINVAL; + goto out; + } + + /* We cannot split a compressed ordered extent */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { + ret = -EINVAL; + goto out; + } + + ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; + /* bio must be in one ordered extent */ + if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { + ret = -EINVAL; + goto out; + } + + /* Checksum list should be empty */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) { + ret = -EINVAL; + goto out; + } + + pre = start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, ordered->file_offset, len); + if (!em) { + read_unlock(&em_tree->lock); + ret = -EIO; + goto out; + } + read_unlock(&em_tree->lock); + + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + /* + * We cannot reuse em_new here but have to create a new one, as + * unpin_extent_cache() expects the start of the extent map to be the + * logical offset of the file, which does not hold true anymore after + * splitting. + */ + em_new = create_io_em(inode, em->start + pre, len, + em->start + pre, em->block_start + pre, len, + len, len, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_REGULAR); + if (IS_ERR(em_new)) { + ret = PTR_ERR(em_new); + goto out; + } + free_extent_map(em_new); + +out: + free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + /* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. @@ -2252,7 +2394,16 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (bio_op(bio) != REQ_OP_WRITE) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *page = bio_first_bvec_all(bio)->bv_page; + loff_t file_offset = page_offset(page); + + ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + if (ret) + goto out; + } + + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) goto out; @@ -2754,6 +2905,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (ordered_extent->disk) + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { @@ -3103,14 +3257,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /** - * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running - * @fs_info - the fs_info for this fs - * @return - EINTR if we were killed, 0 if nothing's pending + * Wait for flushing all delayed iputs + * + * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. + * + * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -4720,6 +4876,9 @@ again: ret = -ENOMEM; goto out; } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); @@ -4737,7 +4896,6 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, block_start, block_end, &cached_state); - set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { @@ -5011,6 +5169,15 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, + ALIGN(newsize, fs_info->sectorsize), + (u64)-1); + if (ret) + return ret; + } /* * We're truncating a file that used to have good data down to @@ -6371,7 +6538,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6435,7 +6602,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6579,7 +6746,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_fail; @@ -7103,9 +7270,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * - * This function will flush ordered extents in the range to ensure proper - * nocow checks for (nowait == false) case. - * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write @@ -7613,6 +7777,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; + if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + iomap->flags |= IOMAP_F_ZONE_APPEND; + free_extent_map(em); return 0; @@ -7682,7 +7849,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->logical_offset, dip->bytes, @@ -7797,10 +7964,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, NULL); btrfs_queue_work(wq, &ordered->work); } - /* - * If btrfs_dec_test_ordered_pending does not find any ordered - * extent in the range, we can exit. - */ + + /* No ordered extent found in the range, exit */ if (ordered_offset == last_offset) return; /* @@ -7841,6 +8006,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) dip->dio_bio->bi_status = err; + btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + bio_put(bio); btrfs_dio_private_put(dip); } @@ -7850,7 +8017,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = bio_op(bio) == REQ_OP_WRITE; + bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; /* Check btrfs_submit_bio_hook() for rules about async submit. */ @@ -7900,7 +8067,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); size_t dip_size; struct btrfs_dio_private *dip; @@ -7930,7 +8097,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); @@ -7941,10 +8108,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 submit_len; int clone_offset = 0; int clone_len; + u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; struct btrfs_dio_data *dio_data = iomap->private; + struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7973,12 +8142,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, submit_len = dio_bio->bi_iter.bi_size; do { - ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), - start_sector << 9, submit_len, - &geom); + logical = start_sector << 9; + em = btrfs_get_chunk_map(fs_info, logical, submit_len); + if (IS_ERR(em)) { + status = errno_to_blk_status(PTR_ERR(em)); + em = NULL; + goto out_err_em; + } + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), + logical, submit_len, &geom); if (ret) { status = errno_to_blk_status(ret); - goto out_err; + goto out_err_em; } ASSERT(geom.len <= INT_MAX); @@ -7993,6 +8168,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; + WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && + fs_info->max_zone_append_size && + bio_op(bio) != REQ_OP_ZONE_APPEND); + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + status = extract_ordered_extent(BTRFS_I(inode), bio, + file_offset); + if (status) { + bio_put(bio); + goto out_err; + } + } + ASSERT(submit_len >= clone_len); submit_len -= clone_len; @@ -8023,19 +8211,24 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio_put(bio); if (submit_len > 0) refcount_dec(&dip->refs); - goto out_err; + goto out_err_em; } dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; + + free_extent_map(em); } while (submit_len > 0); return BLK_QC_T_NONE; +out_err_em: + free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } @@ -8117,7 +8310,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) - detach_page_private(page); + clear_page_extent_mapped(page); return ret; } @@ -8186,8 +8379,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); -again: + start = page_start; +again: ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { found_ordered = true; @@ -8276,7 +8470,7 @@ again: } ClearPageChecked(page); - detach_page_private(page); + clear_page_extent_mapped(page); } /* @@ -8355,7 +8549,12 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, page_start, page_end, &cached_state); - set_page_extent_mapped(page); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } /* * we can't set the delalloc bits if there are pending ordered @@ -8592,15 +8791,18 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid) + struct btrfs_root *parent_root) { struct inode *inode; int err; u64 index = 0; + u64 ino; + + err = btrfs_get_free_objectid(new_root, &ino); + if (err < 0) + return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, - new_dirid, new_dirid, + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) @@ -9079,7 +9281,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) return ret; @@ -9486,11 +9688,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) return start_delalloc_inodes(root, &wbc, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { - .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, + .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, @@ -9507,12 +9709,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice) && nr) { + while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. */ - if (nr == U64_MAX) + if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, @@ -9575,7 +9777,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9909,7 +10111,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dde49a791f3e..a8c60d46d19c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -528,6 +528,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, return -EPERM; /* + * btrfs_trim_block_group() depends on space cache, which is not + * available in zoned filesystem. So, disallow fitrim on a zoned + * filesystem for now. + */ + if (btrfs_is_zoned(fs_info)) + return -EOPNOTSUPP; + + /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space * inside block groups, because log trees refer to extents that are not @@ -606,14 +614,13 @@ static noinline int create_subvol(struct inode *dir, int err; dev_t anon_dev = 0; u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) return -ENOMEM; - ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); + ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; @@ -693,7 +700,7 @@ static noinline int create_subvol(struct inode *dir, free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(root_item, new_dirid); + btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; @@ -716,7 +723,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ @@ -724,10 +731,6 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - mutex_lock(&new_root->objectid_mutex); - new_root->highest_objectid = new_dirid; - mutex_unlock(&new_root->objectid_mutex); - /* * insert the directory item */ @@ -1319,6 +1322,13 @@ again: if (!page) break; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; while (1) { @@ -1440,7 +1450,6 @@ again: for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); - set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); @@ -4951,7 +4960,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 79d366a36223..985a21558437 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,14 +199,21 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) - set_bit(type, &entry->flags); + entry->physical = (u64)-1; + entry->disk = NULL; + entry->partno = (u8)-1; - if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, - fs_info->delalloc_batch); + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED); + set_bit(type, &entry->flags); + + percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, + fs_info->delalloc_batch); + + if (dio) set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -256,6 +263,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); @@ -265,6 +275,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); @@ -272,11 +285,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type) + u64 disk_num_bytes, int compress_type) { + ASSERT(compress_type != BTRFS_COMPRESS_NONE); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, + num_bytes, disk_num_bytes, + BTRFS_ORDERED_COMPRESSED, 0, compress_type); } @@ -297,26 +311,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * this is used to account for finished IO across a given range - * of the file. The IO may span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can + * contain several ordered extents. + * + * @found_ret: Return the finished ordered extent + * @file_offset: File offset for the finished IO + * Will also be updated to one byte past the range that is + * recordered as finished. This allows caller to walk forward. + * @io_size: Length of the finish IO range + * @uptodate: If the IO finished without problem * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * Return true if any ordered extent is finished in the range, and update + * @found_ret and @file_offset. + * Return false otherwise. * - * file_offset is updated to one byte past the range that is recorded as - * complete. This allows you to walk forward in the file. + * NOTE: Although The range can cross multiple ordered extents, only one + * ordered extent will be updated during one call. The caller is responsible to + * iterate all ordered extents in the range. */ -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - int ret; + bool finished = false; unsigned long flags; u64 dec_end; u64 dec_start; @@ -324,16 +345,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) { - ret = 1; + if (!offset_in_entry(entry, *file_offset)) goto out; - } dec_start = max(*file_offset, entry->file_offset); dec_end = min(*file_offset + io_size, @@ -354,39 +371,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { - *cached = entry; + if (finished && finished_ret && entry) { + *finished_ret = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* - * this is used to account for finished IO across a given range - * of the file. The IO should not span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can only + * contain one ordered extent. + * + * @cached: The cached ordered extent. If not NULL, we can skip the tree + * search and use the ordered extent directly. + * Will be also used to store the finished ordered extent. + * @file_offset: File offset for the finished IO + * @io_size: Length of the finish IO range + * @uptodate: If the IO finishes without problem * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * Return true if the ordered extent is finished in the range, and update + * @cached. + * Return false otherwise. + * + * NOTE: The range can NOT cross multiple ordered extents. + * Thus caller should ensure the range doesn't cross ordered extents. */ -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - int ret; + bool finished = false; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { @@ -395,41 +423,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, } node = tree_search(tree, file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) { - ret = 1; + if (!offset_in_entry(entry, file_offset)) goto out; - } - if (io_size > entry->bytes_left) { + if (io_size > entry->bytes_left) btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); - } + entry->bytes_left -= io_size; if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { + if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* @@ -480,9 +506,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, false); - if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, + fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); @@ -745,9 +770,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, file_offset); if (!node) goto out; @@ -758,7 +784,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino if (entry) refcount_inc(&entry->refs); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return entry; } @@ -898,6 +924,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, } } +static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, + u64 len) +{ + struct inode *inode = ordered->inode; + u64 file_offset = ordered->file_offset + pos; + u64 disk_bytenr = ordered->disk_bytenr + pos; + u64 num_bytes = len; + u64 disk_num_bytes = len; + int type; + unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); + int compress_type = ordered->compress_type; + unsigned long weight; + int ret; + + weight = hweight_long(flags_masked); + WARN_ON_ONCE(weight > 1); + if (!weight) + type = 0; + else + type = __ffs(flags_masked); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { + WARN_ON_ONCE(1); + ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), + file_offset, disk_bytenr, num_bytes, + disk_num_bytes, compress_type); + } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } else { + ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } + + return ret; +} + +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post) +{ + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret = 0; + + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + + ordered->file_offset += pre; + ordered->disk_bytenr += pre; + ordered->num_bytes -= (pre + post); + ordered->disk_num_bytes -= (pre + post); + ordered->bytes_left -= (pre + post); + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + ordered->file_offset); + + spin_unlock_irq(&tree->lock); + + if (pre) + ret = clone_ordered_extent(ordered, 0, pre); + if (post) + ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, + post); + + return ret; +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bfa82b58e23..99e0853e4d3b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -27,7 +27,7 @@ struct btrfs_ordered_sum { }; /* - * bits for the flags field: + * Bits for btrfs_ordered_extent::flags. * * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. * It is used to make sure metadata is inserted into the tree only once @@ -38,24 +38,36 @@ struct btrfs_ordered_sum { * IO is done and any metadata is inserted into the tree. */ enum { + /* + * Different types for direct io, one and only one of the 4 type can + * be set when creating ordered extent. + * + * REGULAR: For regular non-compressed COW write + * NOCOW: For NOCOW write into existing non-hole extent + * PREALLOC: For NOCOW write into preallocated extent + * COMPRESSED: For compressed COW write + */ + BTRFS_ORDERED_REGULAR, + BTRFS_ORDERED_NOCOW, + BTRFS_ORDERED_PREALLOC, + BTRFS_ORDERED_COMPRESSED, + + /* + * Extra bit for direct io, can only be set for + * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. + */ + BTRFS_ORDERED_DIRECT, + + /* Extra status bits for ordered extents */ + /* set when all the pages are written */ BTRFS_ORDERED_IO_DONE, /* set when removed from the tree */ BTRFS_ORDERED_COMPLETE, - /* set when we want to write in place */ - BTRFS_ORDERED_NOCOW, - /* writing a zlib compressed extent */ - BTRFS_ORDERED_COMPRESSED, - /* set when writing to preallocated extent */ - BTRFS_ORDERED_PREALLOC, - /* set when we're doing DIO with this extent */ - BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, - /* Regular IO for COW */ - BTRFS_ORDERED_REGULAR, /* Used during fsync to track already logged extents */ BTRFS_ORDERED_LOGGED, /* We have already logged all the csums of the ordered extent */ @@ -127,6 +139,14 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; + + /* + * Used to reverse-map physical address returned from ZONE_APPEND write + * command in a workqueue context + */ + u64 physical; + struct gendisk *disk; + u8 partno; }; /* @@ -152,11 +172,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate); +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, @@ -167,8 +187,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type); + u64 disk_num_bytes, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, @@ -190,6 +209,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..5394641541f7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) - kvfree(x); + kvfree(x); return 0; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 4b9b6c52a83b..2b490becbe67 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, } static int process_leaf(struct btrfs_root *root, - struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret = 0; + int i = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); @@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root, case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, - &tree_block_level); + tree_block_level); break; case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, key.offset, 0, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, key.offset, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = btrfs_item_ptr(leaf, i, @@ -549,7 +550,8 @@ static int process_leaf(struct btrfs_root *root, /* Walk down to the leaf from the given level */ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, - int level, u64 *bytenr, u64 *num_bytes) + int level, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct extent_buffer *eb; int ret = 0; @@ -565,7 +567,8 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK; } else { - ret = process_leaf(root, path, bytenr, num_bytes); + ret = process_leaf(root, path, bytenr, num_bytes, + tree_block_level); if (ret) break; } @@ -666,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root; - u64 owner; - u64 offset; + u64 ref_root = 0; + u64 owner = 0; + u64 offset = 0; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; if (generic_ref->type == BTRFS_REF_METADATA) { - ref_root = generic_ref->tree_ref.root; + if (!parent) + ref_root = generic_ref->tree_ref.root; owner = generic_ref->tree_ref.level; - offset = 0; - } else { + } else if (!parent) { ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; @@ -693,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, goto out; } - if (parent) { - ref->parent = parent; - } else { - ref->root_objectid = ref_root; - ref->owner = owner; - ref->offset = offset; - } + ref->parent = parent; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; memcpy(&ra->ref, ref, sizeof(struct ref_entry)); @@ -974,6 +974,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct extent_buffer *eb; + int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -998,7 +999,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * different leaf from the original extent item. */ ret = walk_down_tree(fs_info->extent_root, path, level, - &bytenr, &num_bytes); + &bytenr, &num_bytes, &tree_block_level); if (ret) break; ret = walk_up_tree(path, &level); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b03e7891394e..b24396cf2f99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode, goto out_unlock; } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index df63ef64c5c0..232d5da7b7be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -97,6 +97,7 @@ struct tree_block { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simple_node for search/insert */ + u64 owner; struct btrfs_key key; unsigned int level:8; unsigned int key_ready:1; @@ -668,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root) RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); + ASSERT(!node || (struct btrfs_root *)node->data == root); } /* @@ -2393,8 +2392,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, - block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, block->owner, + block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2493,7 +2492,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + btrfs_readahead_tree_block(fs_info, block->bytenr, + block->owner, 0, block->level); } @@ -2553,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; + /* + * On a zoned filesystem, we cannot preallocate the file region. + * Instead, we dirty and fiemap_write the region. + */ + if (btrfs_is_zoned(inode->root->fs_info)) { + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + + end = cluster->end - offset + 1; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + i_size_write(&inode->vfs_inode, end); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + return btrfs_end_transaction(trans); + } + inode_lock(&inode->vfs_inode); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2615,7 +2640,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, /* * Allow error injection to test balance cancellation */ -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || fatal_signal_pending(current); @@ -2679,6 +2704,15 @@ static int relocate_file_extent_cluster(struct inode *inode, goto out; } } + ret = set_page_extent_mapped(page); + if (ret < 0) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + unlock_page(page); + put_page(page); + goto out; + } if (PageReadahead(page)) { page_cache_async_readahead(inode->i_mapping, @@ -2706,8 +2740,6 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - set_page_extent_mapped(page); - if (nr < cluster->nr && page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, @@ -2749,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode, } } WARN_ON(nr != cluster->nr); + if (btrfs_is_zoned(fs_info) && !ret) + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); out: kfree(ra); return ret; @@ -2801,21 +2835,58 @@ static int add_tree_block(struct reloc_control *rc, u32 item_size; int level = -1; u64 generation; + u64 owner = 0; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { + unsigned long ptr = 0, end; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + end = (unsigned long)ei + item_size; if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); level = btrfs_tree_block_level(eb, bi); + ptr = (unsigned long)(bi + 1); } else { level = (int)extent_key->offset; + ptr = (unsigned long)(ei + 1); } generation = btrfs_extent_generation(eb, ei); + + /* + * We're reading random blocks without knowing their owner ahead + * of time. This is ok most of the time, as all reloc roots and + * fs roots have the same lock type. However normal trees do + * not, and the only way to know ahead of time is to read the + * inline ref offset. We know it's an fs root if + * + * 1. There's more than one ref. + * 2. There's a SHARED_DATA_REF_KEY set. + * 3. FULL_BACKREF is set on the flags. + * + * Otherwise it's safe to assume that the ref offset == the + * owner of this block, so we can use that when calling + * read_tree_block. + */ + if (btrfs_extent_refs(eb, ei) == 1 && + !(btrfs_extent_flags(eb, ei) & + BTRFS_BLOCK_FLAG_FULL_BACKREF) && + ptr < end) { + struct btrfs_extent_inline_ref *iref; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + if (type == BTRFS_TREE_BLOCK_REF_KEY) + owner = btrfs_extent_inline_ref_offset(eb, iref); + } } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { btrfs_print_v0_err(eb->fs_info); btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); @@ -2837,6 +2908,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.offset = generation; block->level = level; block->key_ready = 0; + block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) @@ -3389,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; + if (btrfs_is_zoned(trans->fs_info)) + flags &= ~BTRFS_INODE_PREALLOC; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3405,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, item, flags); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -3434,7 +3509,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(trans); } - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..310fce00fcda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -166,6 +166,7 @@ struct scrub_ctx { int pages_per_rd_bio; int is_dev_replace; + u64 write_pointer; struct scrub_bio *wr_curr_bio; struct mutex wr_lock; @@ -856,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) + return btrfs_repair_one_zone(fs_info, logical); + /* * We must use GFP_NOFS because the scrub task might be waiting for a * worker task executing this function and in turn a transaction commit @@ -1619,6 +1623,28 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, return scrub_add_page_to_wr_bio(sblock->sctx, spage); } +static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +{ + int ret = 0; + u64 length; + + if (!btrfs_is_zoned(sctx->fs_info)) + return 0; + + if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + + if (sctx->write_pointer < physical) { + length = physical - sctx->write_pointer; + + ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, + sctx->write_pointer, length); + if (!ret) + sctx->write_pointer = physical; + } + return ret; +} + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { @@ -1641,6 +1667,13 @@ again: if (sbio->page_count == 0) { struct bio *bio; + ret = fill_writer_pointer_gap(sctx, + spage->physical_for_dev_replace); + if (ret) { + mutex_unlock(&sctx->wr_lock); + return ret; + } + sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; sbio->dev = sctx->wr_tgtdev; @@ -1702,6 +1735,9 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * doubled the write performance on spinning disks when measured * with Linux 3.5 */ btrfsic_submit_bio(sbio->bio); + + if (btrfs_is_zoned(sctx->fs_info)) + sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -3025,6 +3061,46 @@ out: return ret < 0 ? ret : 0; } +static void sync_replace_for_zoned(struct scrub_ctx *sctx) +{ + if (!btrfs_is_zoned(sctx->fs_info)) + return; + + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +} + +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3165,6 +3241,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ blk_start_plug(&plug); + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); + sctx->flush_all_writes = true; + } + /* * now find all extents for each stripe and scrub them */ @@ -3353,6 +3437,9 @@ again: if (ret) goto out; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3420,6 +3507,17 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); btrfs_free_path(ppath); + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; + + ret2 = sync_write_pointer_for_zoned(sctx, base + offset, + map->stripes[num].physical, + physical_end); + if (ret2) + ret = ret2; + } + return ret < 0 ? ret : 0; } @@ -3475,6 +3573,25 @@ out: return ret; } +static int finish_extent_writes_for_zoned(struct btrfs_root *root, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) @@ -3561,6 +3678,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + spin_lock(&cache->lock); + if (!cache->to_copy) { + spin_unlock(&cache->lock); + ro_set = 0; + goto done; + } + spin_unlock(&cache->lock); + } + /* * Make sure that while we are scrubbing the corresponding block * group doesn't get its logical address and its device extents @@ -3619,6 +3746,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * group is not RO. */ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, cache); + if (ret) { + btrfs_dec_block_group_ro(cache); + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + if (ret == 0) { ro_set = 1; } else if (ret == -ENOSPC && !sctx->is_dev_replace) { @@ -3692,6 +3829,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) + ro_set = 0; + +done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ae97f4dbaff3..f87878274e9f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1191,9 +1191,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* data offset in the file extent item */ - u64 data_offset; - /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1401,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; - /* - * For non-compressed extents iterate_extent_inodes() gives us extent - * offsets that already take into account the data offset, but not for - * compressed extents, since the offset is logical and not relative to - * the physical extent locations. We must take this into account to - * avoid sending clone offsets that go beyond the source file's size, - * which would result in the clone ioctl failing with -EINVAL on the - * receiving end. - */ - if (compressed == BTRFS_COMPRESS_NONE) - backref_ctx->data_offset = 0; - else - backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -5512,6 +5496,21 @@ static int clone_range(struct send_ctx *sctx, break; offset += clone_len; clone_root->offset += clone_len; + + /* + * If we are cloning from the file we are currently processing, + * and using the send root as the clone root, we must stop once + * the current clone offset reaches the current eof of the file + * at the receiver, otherwise we would issue an invalid clone + * operation (source range going beyond eof) and cause the + * receiver to fail. So if we reach the current eof, bail out + * and fallback to a regular write. + */ + if (clone_root->root == sctx->send_root && + clone_root->ino == sctx->cur_ino && + clone_root->offset >= sctx->cur_inode_next_write_offset) + break; + data_offset += clone_len; next: path->slots[0]++; @@ -6592,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, - void *ctx) + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || @@ -6800,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, void *ctx) + struct btrfs_root *right_root, struct send_ctx *sctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6952,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; } @@ -6963,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; } @@ -6977,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; advance_left = ADVANCE; @@ -6985,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; advance_right = ADVANCE; @@ -7000,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, else result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, - &left_key, result, ctx); + &left_key, result, sctx); if (ret < 0) goto out; advance_left = ADVANCE; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8347461c8dd..2da6177f4b0b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -140,6 +140,12 @@ * be freed, plus any delayed work we may not have gotten rid of in the case * of metadata. * + * FORCE_COMMIT_TRANS + * For use by the preemptive flusher. We use this to bypass the ticketing + * checks in may_commit_transaction, as we have more information about the + * overall state of the system and may want to commit the transaction ahead + * of actual ENOSPC conditions. + * * OVERCOMMIT * * Because we hold so many reservations for metadata we will allow you to @@ -163,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, ASSERT(s_info); return s_info->bytes_used + s_info->bytes_reserved + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + (may_use_included ? s_info->bytes_may_use : 0); } @@ -206,6 +213,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -257,7 +265,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -273,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->bytes_readonly += bytes_readonly; + found->bytes_zone_unusable += bytes_zone_unusable; if (total_bytes > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -422,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, info->total_bytes - btrfs_space_info_used(info, true), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); + info->bytes_readonly, info->bytes_zone_unusable); DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); @@ -454,9 +463,10 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); + cache->reserved, cache->zone_unusable, + cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } @@ -489,7 +499,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, { struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 dio_bytes; + u64 ordered_bytes; u64 items; long time_left; int loops; @@ -513,26 +523,22 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) return; - } /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ - if (dio_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes) wait_ordered = true; loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { - u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + while ((delalloc_bytes || ordered_bytes) && loops < 3) { + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = min_t(u64, temp, LONG_MAX); btrfs_start_delalloc_roots(fs_info, nr_pages, true); @@ -555,15 +561,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + ordered_bytes = percpu_counter_sum_positive( + &fs_info->ordered_bytes); } } /** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit + * Possibly commit the transaction if its ok to + * + * @fs_info: the filesystem + * @space_info: space_info we are checking for commit, either data or metadata * * This will check to make sure that committing the transaction will actually * get us somewhere and then commit the transaction if it does. Otherwise it @@ -669,7 +676,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - int state) + enum btrfs_flush_state state, bool for_preempt) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -738,13 +745,21 @@ static void flush_space(struct btrfs_fs_info *fs_info, case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; + case FORCE_COMMIT_TRANS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_commit_transaction(trans); + break; default: ret = -ENOSPC; break; } trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); + ret, for_preempt); return; } @@ -754,7 +769,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; - u64 expected; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -772,43 +786,88 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, if (space_info->total_bytes + avail < used) to_reclaim += used - (space_info->total_bytes + avail); - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) +static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) { + u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 used; /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; + return false; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) - return 0; + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + /* + * If we have over half of the free space occupied by reservations or + * pinned then we want to start flushing. + * + * We do not do the traditional thing here, which is to say + * + * if (used >= ((total_bytes + avail) / 2)) + * return 1; + * + * because this doesn't quite work how we want. If we had more than 50% + * of the space_info used by bytes_used and we had 0 available we'd just + * constantly run the background flusher. Instead we want it to kick in + * if our reclaimable space exceeds our clamped free space. + * + * Our clamping range is 2^1 -> 2^8. Practically speaking that means + * the following: + * + * Amount of RAM Minimum threshold Maximum threshold + * + * 256GiB 1GiB 128GiB + * 128GiB 512MiB 64GiB + * 64GiB 256MiB 32GiB + * 32GiB 128MiB 16GiB + * 16GiB 64MiB 8GiB + * + * These are the range our thresholds will fall in, corresponding to how + * much delalloc we need for the background flusher to kick in. + */ + + thresh = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + thresh += (space_info->total_bytes - space_info->bytes_used - + space_info->bytes_reserved - space_info->bytes_readonly); + thresh >>= space_info->clamp; + + used = space_info->bytes_pinned; + + /* + * If we have more ordered bytes than delalloc bytes then we're either + * doing a lot of DIO, or we simply don't have a lot of delalloc waiting + * around. Preemptive flushing is only useful in that it can free up + * space before tickets need to wait for things to finish. In the case + * of ordered extents, preemptively waiting on ordered extents gets us + * nothing, if our reservations are tied up in ordered extents we'll + * simply have to slow down writers by forcing them to wait on ordered + * extents. + * + * In the case that ordered is larger than delalloc, only include the + * block reserves that we would actually be able to directly reclaim + * from. In this case if we're heavy on metadata operations this will + * clearly be heavy enough to warrant preemptive flushing. In the case + * of heavy DIO or ordered reservations, preemptive flushing will just + * waste time and cause us to slow down. + */ + ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + if (ordered >= delalloc) + used += fs_info->delayed_refs_rsv.reserved + + fs_info->delayed_block_rsv.reserved; + else + used += space_info->bytes_may_use; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -922,7 +981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; - int flush_state; + enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; @@ -941,7 +1000,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -990,6 +1049,105 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * This handles pre-flushing of metadata space before we get to the point that + * we need to start blocking threads on tickets. The logic here is different + * from the other flush paths because it doesn't rely on tickets to tell us how + * much we need to flush, instead it attempts to keep us below the 80% full + * watermark of space by flushing whichever reservation pool is currently the + * largest. + */ +static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv; + struct btrfs_block_rsv *trans_rsv; + int loops = 0; + + fs_info = container_of(work, struct btrfs_fs_info, + preempt_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + delayed_block_rsv = &fs_info->delayed_block_rsv; + delayed_refs_rsv = &fs_info->delayed_refs_rsv; + global_rsv = &fs_info->global_block_rsv; + trans_rsv = &fs_info->trans_block_rsv; + + spin_lock(&space_info->lock); + while (need_preemptive_reclaim(fs_info, space_info)) { + enum btrfs_flush_state flush; + u64 delalloc_size = 0; + u64 to_reclaim, block_rsv_size; + u64 global_rsv_size = global_rsv->reserved; + + loops++; + + /* + * We don't have a precise counter for the metadata being + * reserved for delalloc, so we'll approximate it by subtracting + * out the block rsv's space from the bytes_may_use. If that + * amount is higher than the individual reserves, then we can + * assume it's tied up in delalloc reservations. + */ + block_rsv_size = global_rsv_size + + delayed_block_rsv->reserved + + delayed_refs_rsv->reserved + + trans_rsv->reserved; + if (block_rsv_size < space_info->bytes_may_use) + delalloc_size = space_info->bytes_may_use - block_rsv_size; + spin_unlock(&space_info->lock); + + /* + * We don't want to include the global_rsv in our calculation, + * because that's space we can't touch. Subtract it from the + * block_rsv_size for the next checks. + */ + block_rsv_size -= global_rsv_size; + + /* + * We really want to avoid flushing delalloc too much, as it + * could result in poor allocation patterns, so only flush it if + * it's larger than the rest of the pools combined. + */ + if (delalloc_size > block_rsv_size) { + to_reclaim = delalloc_size; + flush = FLUSH_DELALLOC; + } else if (space_info->bytes_pinned > + (delayed_block_rsv->reserved + + delayed_refs_rsv->reserved)) { + to_reclaim = space_info->bytes_pinned; + flush = FORCE_COMMIT_TRANS; + } else if (delayed_block_rsv->reserved > + delayed_refs_rsv->reserved) { + to_reclaim = delayed_block_rsv->reserved; + flush = FLUSH_DELAYED_ITEMS_NR; + } else { + to_reclaim = delayed_refs_rsv->reserved; + flush = FLUSH_DELAYED_REFS_NR; + } + + /* + * We don't want to reclaim everything, just a portion, so scale + * down the to_reclaim by 1/4. If it takes us down to 0, + * reclaim 1 items worth. + */ + to_reclaim >>= 2; + if (!to_reclaim) + to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); + flush_space(fs_info, space_info, to_reclaim, flush, true); + cond_resched(); + spin_lock(&space_info->lock); + } + + /* We only went through once, back off our clamping. */ + if (loops == 1 && !space_info->reclaim_size) + space_info->clamp = max(1, space_info->clamp - 1); + trace_btrfs_done_preemptive_reclaim(fs_info, space_info); + spin_unlock(&space_info->lock); +} + +/* * FLUSH_DELALLOC_WAIT: * Space is freed from flushing delalloc in one of two ways. * @@ -1054,7 +1212,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 last_tickets_id; - int flush_state = 0; + enum btrfs_flush_state flush_state = 0; fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); space_info = fs_info->data_sinfo; @@ -1069,7 +1227,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1082,7 +1240,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) while (flush_state < ARRAY_SIZE(data_flush_states)) { flush_space(fs_info, space_info, U64_MAX, - data_flush_states[flush_state]); + data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1115,6 +1273,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); + INIT_WORK(&fs_info->preempt_reclaim_work, + btrfs_preempt_reclaim_metadata_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1153,7 +1313,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, states[flush_state]); + flush_space(fs_info, space_info, to_reclaim, states[flush_state], + false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -1169,7 +1330,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket) { while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); @@ -1214,11 +1375,14 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, } /** - * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket - * @fs_info - the fs - * @space_info - the space_info for the reservation - * @ticket - the ticket for the reservation - * @flush - how much we can flush + * Do the appropriate flushing and waiting for a ticket + * + * @fs_info: the filesystem + * @space_info: space info for the reservation + * @ticket: ticket for the reservation + * @start_ns: timestamp when the reservation started + * @orig_bytes: amount of bytes originally reserved + * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. @@ -1226,6 +1390,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, + u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; @@ -1281,6 +1446,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * space wasn't reserved at all). */ ASSERT(!(ticket->bytes == 0 && ticket->error)); + trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, + start_ns, flush, ticket->error); return ret; } @@ -1294,12 +1461,31 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } +static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + + /* + * If we're heavy on ordered operations then clamping won't help us. We + * need to clamp specifically to keep up with dirty'ing buffered + * writers, because there's not a 1:1 correlation of writing delalloc + * and freeing space, like there is with flushing delayed refs or + * delayed nodes. If we're already more ordered than delalloc then + * we're keeping up, otherwise we aren't and should probably clamp. + */ + if (ordered < delalloc) + space_info->clamp = min(space_info->clamp + 1, 8); +} + /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Try to reserve bytes from the block_rsv's space + * + * @fs_info: the filesystem + * @space_info: space info we want to allocate from + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1314,6 +1500,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, { struct work_struct *async_work; struct reserve_ticket ticket; + u64 start_ns = 0; u64 used; int ret = 0; bool pending_tickets; @@ -1366,6 +1553,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + if (trace_btrfs_reserve_ticket_enabled()) + start_ns = ktime_get_ns(); + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_DATA) { @@ -1382,6 +1572,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } + + /* + * We were forced to add a reserve ticket, so our preemptive + * flushing is unable to keep up. Clamp down on the threshold + * for the preemptive flushing in order to keep up with the + * workload. + */ + maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -1390,27 +1588,29 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, used) && - !work_busy(&fs_info->async_reclaim_work)) { + need_preemptive_reclaim(fs_info, space_info) && + !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + &fs_info->preempt_reclaim_work); } } spin_unlock(&space_info->lock); if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, flush); + return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, + orig_bytes, flush); } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Trye to reserve metadata bytes from the block_rsv's space + * + * @root: the root we're allocating for + * @block_rsv: block_rsv we're allocating for + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1448,10 +1648,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } /** - * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation - * @fs_info - the filesystem - * @bytes - the number of bytes we need - * @flush - how we are allowed to flush + * Try to reserve data bytes for an allocation + * + * @fs_info: the filesystem + * @bytes: number of bytes we need + * @flush: how we are allowed to flush * * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5646393b928c..b1a8ffb03b3e 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -17,11 +17,17 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -119,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); @@ -152,4 +158,21 @@ static inline void btrfs_space_info_free_bytes_may_use( int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +static inline void __btrfs_mod_total_bytes_pinned( + struct btrfs_space_info *space_info, + s64 mod) +{ + percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, + BTRFS_TOTAL_BYTES_PINNED_BATCH); +} + +static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, + u64 flags, s64 mod) +{ + struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); + + ASSERT(space_info); + __btrfs_mod_total_bytes_pinned(space_info, mod); +} + #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c new file mode 100644 index 000000000000..c69049e7daa9 --- /dev/null +++ b/fs/btrfs/subpage.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/slab.h> +#include "ctree.h" +#include "subpage.h" + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type) +{ + struct btrfs_subpage *subpage = NULL; + int ret; + + /* + * We have cases like a dummy extent buffer page, which is not mappped + * and doesn't need to be locked. + */ + if (page->mapping) + ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ + if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + return 0; + + ret = btrfs_alloc_subpage(fs_info, &subpage, type); + if (ret < 0) + return ret; + attach_page_private(page, subpage); + return 0; +} + +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + /* Either not subpage, or already detached */ + if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + return; + + subpage = (struct btrfs_subpage *)detach_page_private(page); + ASSERT(subpage); + btrfs_free_subpage(subpage); +} + +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type) +{ + if (fs_info->sectorsize == PAGE_SIZE) + return 0; + + *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!*ret) + return -ENOMEM; + spin_lock_init(&(*ret)->lock); + if (type == BTRFS_SUBPAGE_METADATA) + atomic_set(&(*ret)->eb_refs, 0); + else + atomic_set(&(*ret)->readers, 0); + return 0; +} + +void btrfs_free_subpage(struct btrfs_subpage *subpage) +{ + kfree(subpage); +} + +/* + * Increase the eb_refs of current subpage. + * + * This is important for eb allocation, to prevent race with last eb freeing + * of the same page. + * With the eb_refs increased before the eb inserted into radix tree, + * detach_extent_buffer_page() won't detach the page private while we're still + * allocating the extent buffer. + */ +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + atomic_inc(&subpage->eb_refs); +} + +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + ASSERT(atomic_read(&subpage->eb_refs)); + atomic_dec(&subpage->eb_refs); +} + +static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + /* + * The range check only works for mapped page, we can still have + * unmapped page like dummy extent buffer pages. + */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); +} + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ret = atomic_add_return(nbits, &subpage->readers); + ASSERT(ret == nbits); +} + +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + ASSERT(atomic_read(&subpage->readers) >= nbits); + if (atomic_sub_and_test(nbits, &subpage->readers)) + unlock_page(page); +} + +/* + * Convert the [start, start + len) range into a u16 bitmap + * + * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. + */ +static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + + /* + * Here nbits can be 16, thus can go beyond u16 range. We make the + * first left shift to be calculate in unsigned long (at least u32), + * then truncate the result to u16. + */ + return (u16)(((1UL << nbits) - 1) << bit_start); +} + +void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap |= tmp; + if (subpage->uptodate_bitmap == U16_MAX) + SetPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap &= ~tmp; + ClearPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap |= tmp; + SetPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap &= ~tmp; + if (subpage->error_bitmap == 0) + ClearPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +/* + * Unlike set/clear which is dependent on each page status, for test all bits + * are tested in the same way. + */ +#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned long flags; \ + bool ret; \ + \ + spin_lock_irqsave(&subpage->lock, flags); \ + ret = ((subpage->name##_bitmap & tmp) == tmp); \ + spin_unlock_irqrestore(&subpage->lock, flags); \ + return ret; \ +} +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); + +/* + * Note that, in selftests (extent-io-tests), we can have empty fs_info passed + * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall + * back to regular sectorsize branch. + */ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ + test_page_func) \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, + PageUptodate); +IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h new file mode 100644 index 000000000000..b86a4881475d --- /dev/null +++ b/fs/btrfs/subpage.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUBPAGE_H +#define BTRFS_SUBPAGE_H + +#include <linux/spinlock.h> + +/* + * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap + * is sufficient. Regular bitmap_* is not used due to size reasons. + */ +#define BTRFS_SUBPAGE_BITMAP_SIZE 16 + +/* + * Structure to trace status of each sector inside a page, attached to + * page::private for both data and metadata inodes. + */ +struct btrfs_subpage { + /* Common members for both data and metadata pages */ + spinlock_t lock; + u16 uptodate_bitmap; + u16 error_bitmap; + union { + /* + * Structures only used by metadata + * + * @eb_refs should only be operated under private_lock, as it + * manages whether the subpage can be detached. + */ + atomic_t eb_refs; + /* Structures only used by data */ + struct { + atomic_t readers; + }; + }; +}; + +enum btrfs_subpage_type { + BTRFS_SUBPAGE_METADATA, + BTRFS_SUBPAGE_DATA, +}; + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page); + +/* Allocate additional data where page represents more than one sector */ +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type); +void btrfs_free_subpage(struct btrfs_subpage *subpage); + +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + +/* + * Template for subpage related operations. + * + * btrfs_subpage_*() are for call sites where the page has subpage attached and + * the range is ensured to be inside the page. + * + * btrfs_page_*() are for call sites where the page can either be subpage + * specific or regular page. The function will handle both cases. + * But the range still needs to be inside the page. + */ +#define DECLARE_BTRFS_SUBPAGE_OPS(name) \ +void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); + +DECLARE_BTRFS_SUBPAGE_OPS(uptodate); +DECLARE_BTRFS_SUBPAGE_OPS(error); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12d7d3be7cd4..f8435641b912 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,7 +48,6 @@ #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" - #include "qgroup.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -2028,6 +2027,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write mount is not yet allowed for sectorsize %u page size %lu", + fs_info->sectorsize, PAGE_SIZE); + ret = -EINVAL; + goto restore; + } /* * NOTE: when remounting with a change that does writes, don't diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 19b9fffa2c9c..6eb1c50fa98c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -666,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); +SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(space_info, total_bytes_pinned, @@ -679,6 +680,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, total_bytes_pinned), diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 57379e96ccc9..c0aefe6dee0b 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8e0f7a1029c6..acff6bb49a97 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -107,6 +108,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | @@ -375,6 +381,8 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -826,10 +834,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) return trans; } -/* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_transaction *commit) +/* Wait for a transaction commit to reach at least the given state. */ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); + wait_event(commit->commit_wait, commit->state >= min_state); } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -884,7 +893,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) goto out; /* nothing committing|committed */ } - wait_for_commit(cur_trans); + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); btrfs_put_transaction(cur_trans); out: return ret; @@ -909,9 +918,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || - cur_trans->delayed_refs.flushing) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; return should_end_transaction(trans); @@ -1230,10 +1238,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; - ret = btrfs_run_dev_stats(trans); if (ret) return ret; @@ -1248,10 +1252,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; @@ -1266,15 +1266,24 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; } + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. + */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1319,7 +1328,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *gang[8]; int i; int ret; - int err = 0; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1331,6 +1339,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; + int ret2; + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1350,17 +1360,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) root->node); } - err = btrfs_update_root(trans, fs_info->tree_root, + ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret2) + return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); - return err; + return 0; } /* @@ -1433,6 +1443,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, record_root_in_trans(trans, src, 1); /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. + * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex */ @@ -1525,7 +1552,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; - pending->error = btrfs_find_free_objectid(tree_root, &objectid); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto no_free_objectid; @@ -1685,12 +1712,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. @@ -1738,12 +1759,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: @@ -2042,32 +2057,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. */ - cur_trans->delayed_refs.flushing = 1; - smp_wmb(); + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; @@ -2101,11 +2109,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans); - wait_for_commit(cur_trans); + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; @@ -2119,13 +2131,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (prev_trans->state != TRANS_STATE_COMPLETED) { + if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); - wait_for_commit(prev_trans); + wait_for_commit(prev_trans, want_state); + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); @@ -2265,14 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_free_log_root_tree(trans, fs_info); /* - * commit_fs_roots() can call btrfs_save_ino_cache(), which generates - * new delayed refs. Must handle them or qgroup can be wrong. - */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - goto unlock_tree_log; - - /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ @@ -2343,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } + /* + * At this point, we should have written all the tree blocks allocated + * in this transaction. So it's now safe to free the redirtyied extent + * buffers. + */ + btrfs_free_redirty_list(cur_trans); + ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers @@ -2352,6 +2369,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 31ca81bad822..6335716e513f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -16,6 +16,7 @@ enum btrfs_trans_state { TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, + TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; @@ -92,6 +93,9 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; }; #define __TRANS_FREEZABLE (1U << 0) @@ -133,6 +137,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool dirty; + bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 254c2ee43aae..d90695c1ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -104,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -139,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -149,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -159,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -172,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -200,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); @@ -2752,6 +2785,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); return ret; } + btrfs_redirty_list_add( + trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3085,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); @@ -3127,6 +3173,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -3194,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); - if (ret) { + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); @@ -3285,17 +3354,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } } clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); + + if (trans && log->node) + btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } @@ -3379,7 +3453,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; int err = 0; - int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) @@ -3406,7 +3479,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; @@ -3421,46 +3493,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; } } - /* update the directory size in the log to reflect the names - * we have removed + /* + * We do not need to update the size field of the directory's inode item + * because on log replay we update the field to reflect all existing + * entries in the directory (see overwrite_item()). */ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } fail: btrfs_free_path(path); out_unlock: @@ -3889,7 +3932,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). + */ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); @@ -5290,12 +5340,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); + err = 1; + goto out_unlock; + } + + /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); @@ -5452,96 +5518,31 @@ out_unlock: } /* - * Check if we must fallback to a transaction commit when logging an inode. - * This must be called after logging the inode and is used only in the context - * when fsyncing an inode requires the need to log some other inode - in which - * case we can't lock the i_mutex of each other inode we need to log as that - * can lead to deadlocks with concurrent fsync against other inodes (as we can - * log inodes up or down in the hierarchy) or rename operations for example. So - * we take the log_mutex of the inode after we have logged it and then check for - * its last_unlink_trans value - this is safe because any task setting - * last_unlink_trans must take the log_mutex and it must do this before it does - * the actual unlink operation, so if we do this check before a concurrent task - * sets last_unlink_trans it means we've logged a consistent version/state of - * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task might have only updated last_unlink_trans before - * we logged the inode or it might have also done the unlink). + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. */ -static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) +static bool need_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { - bool ret = false; - - mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans >= trans->transid) { - /* - * Make sure any commits to the log are forced to be full - * commits. - */ - btrfs_set_log_full_commit(trans); - ret = true; - } - mutex_unlock(&inode->log_mutex); - - return ret; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, - struct dentry *parent, - struct super_block *sb) -{ - int ret = 0; - struct dentry *old_parent = NULL; - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. */ - if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation < trans->transid && - inode->last_unlink_trans < trans->transid) - goto out; - - if (!S_ISDIR(inode->vfs_inode.i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - goto out; - inode = BTRFS_I(d_inode(parent)); - } - - while (1) { - if (btrfs_must_commit_transaction(trans, inode)) { - ret = 1; - break; - } - - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - if (IS_ROOT(parent)) { - inode = BTRFS_I(d_inode(parent)); - if (btrfs_must_commit_transaction(trans, inode)) - ret = 1; - break; - } - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = BTRFS_I(d_inode(parent)); + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; - } - dput(old_parent); -out: - return ret; + return true; } struct btrfs_dir_list { @@ -5671,7 +5672,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { + if (!need_log_inode(trans, BTRFS_I(di_inode))) { btrfs_add_delayed_iput(di_inode); break; } @@ -5681,9 +5682,6 @@ process_leaf: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) - ret = 1; btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; @@ -5821,13 +5819,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(dir_inode); + continue; + } + if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) - ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); @@ -5872,7 +5872,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid) + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5926,7 +5927,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation >= trans->transid) { + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) @@ -6041,12 +6043,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct super_block *sb; int ret = 0; bool log_dentries = false; - sb = inode->vfs_inode.i_sb; - if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; @@ -6057,10 +6056,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb); - if (ret) - goto end_no_trans; - /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they @@ -6307,8 +6302,7 @@ again: * root->objectid_mutex is not acquired as log replay * could only happen during mount. */ - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); } wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b62be84833e9..b8fab44394f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -433,7 +433,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev, fs_info); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, @@ -669,10 +669,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; - ret = btrfs_get_dev_zone_info(device); - if (ret != 0) - goto error_free_page; - fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1418,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * make sure to start at an offset of at least 1MB. */ return max_t(u64, start, SZ_1M); + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. + */ + return ALIGN(start, device->zone_info->zone_size); default: BUG(); } } +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return 1; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + /** * dev_extent_hole_check - check if specified hole is suitable for allocation * @device: the device which we have the hole @@ -1430,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * - * This function may modify @hole_start and @hole_end to reflect the suitable + * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, @@ -1439,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, bool changed = false; u64 hole_end = *hole_start + *hole_size; - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. - */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } - switch (device->fs_devices->chunk_alloc_policy) { - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ break; - default: - BUG(); } return changed; @@ -1509,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device, search_start = dev_extent_search_start(device, search_start); + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4317,6 +4382,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); + btrfs_release_path(path); + mutex_lock(&fs_info->balance_mutex); BUG_ON(fs_info->balance_ctl); spin_lock(&fs_info->balance_lock); @@ -4666,11 +4733,10 @@ again: } ret = btrfs_previous_item(root, path, 0, key.type); - if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret < 0) - goto done; if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + if (ret < 0) + goto done; ret = 0; btrfs_release_path(path); break; @@ -4902,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { @@ -4922,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; + case BTRFS_CHUNK_ALLOC_ZONED: + init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; default: BUG(); } @@ -5048,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, return 0; } +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + u64 zone_size = devices_info[0].dev->zone_info->zone_size; + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * It should hold because: + * dev_extent_min == dev_extent_want == zone_size * dev_stripes + */ + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + + ctl->stripe_size = zone_size; + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, + ctl->stripe_size) + ctl->nparity, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + } + + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5075,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); + case BTRFS_CHUNK_ALLOC_ZONED: + return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } @@ -5839,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non zoned filesystem does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + spin_lock(&cache->lock); + ret = cache->to_copy; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + return ret; +} + static void handle_ops_on_dev_replace(enum btrfs_map_op op, struct btrfs_bio **bbio_ret, struct btrfs_dev_replace *dev_replace, + u64 logical, int *num_stripes_ret, int *max_errors_ret) { struct btrfs_bio *bbio = *bbio_ret; @@ -5855,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, int index_where_to_add; /* + * A block group which have "to_copy" set will eventually + * copied by dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + + /* * duplicate the write operations while the dev replace * procedure is running. Since the copying of the old disk to * the new disk takes place at run time while the filesystem is @@ -5939,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op) } /* - * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) - * tuple. This information is used to calculate how big a - * particular bio can get before it straddles a stripe. + * Calculate the geometry of a particular (address, len) tuple. This + * information is used to calculate how big a particular bio can get before it + * straddles a stripe. * - * @fs_info - the filesystem - * @logical - address that we want to figure out the geometry of - * @len - the length of IO we are going to perform, starting at @logical - * @op - type of operation - write or read - * @io_geom - pointer used to return values + * @fs_info: the filesystem + * @em: mapping containing the logical extent + * @op: type of operation - write or read + * @logical: address that we want to figure out the geometry of + * @len: the length of IO we are going to perform, starting at @logical + * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom) { - struct extent_map *em; struct map_lookup *map; u64 offset; u64 stripe_offset; @@ -5963,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; - int ret = 0; ASSERT(op != BTRFS_MAP_DISCARD); - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) - return PTR_ERR(em); - map = em->map_lookup; /* Offset of this logical address in the chunk */ offset = logical - em->start; @@ -5984,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_crit(fs_info, "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", stripe_offset, offset, em->start, logical, stripe_len); - ret = -EINVAL; - goto out; + return -EINVAL; } /* stripe_offset is the offset of this block in its stripe */ @@ -6032,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom->stripe_offset = stripe_offset; io_geom->raid56_stripe_offset = raid56_full_stripe_start; -out: - /* once for us */ - free_extent_map(em); - return ret; + return 0; } static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -6068,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ASSERT(bbio_ret); ASSERT(op != BTRFS_MAP_DISCARD); - ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + + ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); if (ret < 0) return ret; - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; @@ -6249,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, - &max_errors); + handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + &num_stripes, &max_errors); } *bbio_ret = bbio; @@ -6321,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_device *dev = btrfs_io_bio(bio)->device; ASSERT(dev->bdev); - if (bio_op(bio) == REQ_OP_WRITE) + if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) @@ -6373,6 +6527,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, @@ -6434,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { + ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ret = raid56_parity_write(fs_info, bio, bbio, map_length); } else { @@ -6460,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (bio_op(first_bio) == REQ_OP_WRITE && + (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; @@ -7642,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + out: free_extent_map(em); return ret; @@ -7798,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } + +static int relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + return -EBUSY; + } + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!cache->relocating_repair) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_exclop_finish(fs_info); + + return ret; +} + +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return 0; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return 0; + + spin_lock(&cache->lock); + if (cache->relocating_repair) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + return 0; + } + cache->relocating_repair = 1; + spin_unlock(&cache->lock); + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1997a4649a66..d4c3e0dd32b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -39,10 +39,10 @@ struct btrfs_io_geometry { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __BTRFS_NEED_DEVICE_DATA_ORDERED -#define btrfs_device_data_ordered_init(device, info) \ - seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex) +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) #else -#define btrfs_device_data_ordered_init(device, info) do { } while (0) +#define btrfs_device_data_ordered_init(device) do { } while (0) #endif #define BTRFS_DEV_STATE_WRITEABLE (0) @@ -76,8 +76,7 @@ struct btrfs_device { blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED - /* A seqcount_t with associated chunk_mutex (for lockdep) */ - seqcount_mutex_t data_seqcount; + seqcount_t data_seqcount; #endif /* the internal btrfs device id */ @@ -168,9 +167,11 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \ static inline void \ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ { \ + preempt_disable(); \ write_seqcount_begin(&dev->data_seqcount); \ dev->name = size; \ write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ } #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ @@ -213,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, + BTRFS_CHUNK_ALLOC_ZONED, }; /* @@ -422,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) case REQ_OP_DISCARD: return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); @@ -439,8 +442,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); @@ -595,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c38846659019..9a5cf153da89 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1,14 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/bitops.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" +#include "disk-io.h" +#include "block-group.h" +#include "transaction.h" +#include "dev-replace.h" +#include "space-info.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 +/* Invalid allocation pointer value for missing devices */ +#define WP_MISSING_DEV ((u64)-1) +/* Pseudo write pointer value for conventional zone */ +#define WP_CONVENTIONAL ((u64)-2) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 @@ -119,6 +130,36 @@ static inline u32 sb_zone_number(int shift, int mirror) return 0; } +/* + * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block + * device into static sized chunks and fake a conventional zone on each of + * them. + */ +static int emulate_report_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int nr_zones) +{ + const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(device->bdev); + unsigned int i; + + pos >>= SECTOR_SHIFT; + for (i = 0; i < nr_zones; i++) { + zones[i].start = i * zone_sectors + pos; + zones[i].len = zone_sectors; + zones[i].capacity = zone_sectors; + zones[i].wp = zones[i].start + zone_sectors; + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + zones[i].cond = BLK_ZONE_COND_NOT_WP; + + if (zones[i].wp >= bdev_size) { + i++; + break; + } + } + + return i; +} + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { @@ -127,6 +168,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!*nr_zones) return 0; + if (!bdev_is_zoned(device->bdev)) { + ret = emulate_report_zones(device, pos, zones, *nr_zones); + *nr_zones = ret; + return 0; + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -143,8 +190,78 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +/* The emulated zone size is determined from the size of device extent */ +static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_dev_extent *dext; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); + ret = 0; + +out: + btrfs_free_path(path); + + return ret; +} + +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + /* We can skip reading of zone info for missing devices */ + if (!device->bdev) + continue; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + break; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; struct request_queue *queue = bdev_get_queue(bdev); @@ -153,9 +270,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; unsigned int zone_sectors; + char *model, *emulated; int ret; - if (!bdev_is_zoned(bdev)) + /* + * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not + * yet be set. + */ + if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) @@ -165,8 +287,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); + if (ret) + goto out; + } + + ASSERT(fs_info->zone_size); + zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; + } else { + zone_sectors = bdev_zone_sectors(bdev); + } + nr_sectors = bdev_nr_sectors(bdev); - zone_sectors = bdev_zone_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; @@ -272,20 +406,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) device->zone_info = zone_info; - /* device->fs_info is not safe to use for printing messages */ - btrfs_info_in_rcu(NULL, - "host-%s zoned block device %s, %u zones of %llu bytes", - bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware", - rcu_str_deref(device->name), zone_info->nr_zones, - zone_info->zone_size); + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; + emulated = ""; + break; + case BLK_ZONED_HA: + model = "host-aware zoned"; + emulated = ""; + break; + case BLK_ZONED_NONE: + model = "regular"; + emulated = "emulated "; + break; + default: + /* Just in case */ + btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", + bdev_zoned_model(bdev), + rcu_str_deref(device->name)); + ret = -EOPNOTSUPP; + goto out_free_zone_info; + } + + btrfs_info_in_rcu(fs_info, + "%s block device %s, %u %szones of %llu bytes", + model, rcu_str_deref(device->name), zone_info->nr_zones, + emulated, zone_info->zone_size); return 0; out: kfree(zones); +out_free_zone_info: bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); + device->zone_info = NULL; return ret; } @@ -324,7 +480,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_is_zoned(fs_info); + const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; /* Count zoned devices */ @@ -335,9 +491,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) continue; model = bdev_zoned_model(device->bdev); + /* + * A Host-Managed zoned device must be used as a zoned device. + * A Host-Aware zoned device and a non-zoned devices can be + * treated as a zoned device, if ZONED flag is enabled in the + * superblock. + */ if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; + (model == BLK_ZONED_HA && incompat_zoned) || + (model == BLK_ZONED_NONE && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info = + device->zone_info; zone_info = device->zone_info; zoned_devices++; @@ -406,6 +570,15 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + + /* + * Check mount options here, because we might change fs_info->zoned + * from fs_info->zone_size. + */ + ret = btrfs_check_mountopts_zoned(fs_info); + if (ret) + goto out; btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); out: @@ -488,7 +661,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, unsigned int zone_sectors; u32 sb_zone; int ret; - u64 zone_size; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; @@ -503,7 +675,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; - zone_size = zone_sectors << SECTOR_SHIFT; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; @@ -529,7 +700,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; - if (!zinfo) { + /* + * For a zoned filesystem on a non-zoned block device, use the same + * super block locations as regular filesystem. Doing so, the super + * block can always be retrieved and the zoned flag of the volume + * detected from the super block information. + */ + if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } @@ -614,3 +791,671 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) sb_zone << zone_sectors_shift, zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } + +/** + * btrfs_find_allocatable_zones - find allocatable zones within a given region + * + * @device: the device to allocate a region on + * @hole_start: the position of the hole to allocate the region + * @num_bytes: size of wanted region + * @hole_end: the end of the hole + * @return: position of allocatable zones + * + * Allocatable region should not contain any superblock locations. + */ +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + u64 nzones = num_bytes >> shift; + u64 pos = hole_start; + u64 begin, end; + bool have_sb; + int i; + + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + + while (pos < hole_end) { + begin = pos >> shift; + end = begin + nzones; + + if (end > zinfo->nr_zones) + return hole_end; + + /* Check if zones in the region are all empty */ + if (btrfs_dev_is_sequential(device, pos) && + find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + pos += zinfo->zone_size; + continue; + } + + have_sb = false; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_pos; + + sb_zone = sb_zone_number(shift, i); + if (!(end <= sb_zone || + sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { + have_sb = true; + pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + break; + } + + /* We also need to exclude regular superblock positions */ + sb_pos = btrfs_sb_offset(i); + if (!(pos + num_bytes <= sb_pos || + sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { + have_sb = true; + pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, + zinfo->zone_size); + break; + } + } + if (!have_sb) + break; + } + + return pos; +} + +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes) +{ + int ret; + + *bytes = 0; + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, + GFP_NOFS); + if (ret) + return ret; + + *bytes = length; + while (length) { + btrfs_dev_set_zone_empty(device, physical); + physical += device->zone_info->zone_size; + length -= device->zone_info->zone_size; + } + + return 0; +} + +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + unsigned long begin = start >> shift; + unsigned long end = (start + size) >> shift; + u64 pos; + int ret; + + ASSERT(IS_ALIGNED(start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + + if (end > zinfo->nr_zones) + return -ERANGE; + + /* All the zones are conventional */ + if (find_next_bit(zinfo->seq_zones, begin, end) == end) + return 0; + + /* All the zones are sequential and empty */ + if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && + find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + return 0; + + for (pos = start; pos < start + size; pos += zinfo->zone_size) { + u64 reset_bytes; + + if (!btrfs_dev_is_sequential(device, pos) || + btrfs_dev_is_empty_zone(device, pos)) + continue; + + /* Free regions should be empty */ + btrfs_warn_in_rcu( + device->fs_info, + "zoned: resetting device %s (devid %llu) zone %llu for allocation", + rcu_str_deref(device->name), device->devid, pos >> shift); + WARN_ON_ONCE(1); + + ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, + &reset_bytes); + if (ret) + return ret; + } + + return 0; +} + +/* + * Calculate an allocation pointer from the extent allocation information + * for a block group consist of conventional zones. It is pointed to the + * end of the highest addressed extent in the block group as an allocation + * offset. + */ +static int calculate_alloc_pointer(struct btrfs_block_group *cache, + u64 *offset_ret) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + u64 length; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = cache->start + cache->length; + key.type = 0; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + /* We should not find the exact match */ + if (!ret) + ret = -EUCLEAN; + if (ret < 0) + goto out; + + ret = btrfs_previous_extent_item(root, path, cache->start); + if (ret) { + if (ret == 1) { + ret = 0; + *offset_ret = 0; + } + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) + length = found_key.offset; + else + length = fs_info->nodesize; + + if (!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length)) { + ret = -EUCLEAN; + goto out; + } + *offset_ret = found_key.objectid + length - cache->start; + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *device; + u64 logical = cache->start; + u64 length = cache->length; + u64 physical = 0; + int ret; + int i; + unsigned int nofs_flag; + u64 *alloc_offsets = NULL; + u64 last_alloc = 0; + u32 num_sequential = 0, num_conventional = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + /* Sanity check */ + if (!IS_ALIGNED(length, fs_info->zone_size)) { + btrfs_err(fs_info, + "zoned: block group %llu len %llu unaligned to zone size %llu", + logical, length, fs_info->zone_size); + return -EIO; + } + + /* Get the chunk mapping */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) + return -EINVAL; + + map = em->map_lookup; + + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); + if (!alloc_offsets) { + free_extent_map(em); + return -ENOMEM; + } + + for (i = 0; i < map->num_stripes; i++) { + bool is_sequential; + struct blk_zone zone; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; + + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->bdev == NULL) { + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } + + is_sequential = btrfs_dev_is_sequential(device, physical); + if (is_sequential) + num_sequential++; + else + num_conventional++; + + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; + } + + /* + * This zone will be used for allocation, so mark this zone + * non-empty. + */ + btrfs_dev_clear_zone_empty(device, physical); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write + * pointer to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret == -EIO || ret == -EOPNOTSUPP) { + ret = 0; + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } else if (ret) { + goto out; + } + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + physical >> device->zone_info->zone_size_shift, + rcu_str_deref(device->name), device->devid); + alloc_offsets[i] = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + alloc_offsets[i] = 0; + break; + case BLK_ZONE_COND_FULL: + alloc_offsets[i] = fs_info->zone_size; + break; + default: + /* Partially used zone */ + alloc_offsets[i] = + ((zone.wp - zone.start) << SECTOR_SHIFT); + break; + } + } + + if (num_sequential > 0) + cache->seq_zone = true; + + if (num_conventional > 0) { + /* + * Avoid calling calculate_alloc_pointer() for new BG. It + * is no use for new BG. It must be always 0. + * + * Also, we have a lock chain of extent buffer lock -> + * chunk mutex. For new BG, this function is called from + * btrfs_make_block_group() which is already taking the + * chunk mutex. Thus, we cannot call + * calculate_alloc_pointer() which takes extent buffer + * locks to avoid deadlock. + */ + if (new) { + cache->alloc_offset = 0; + goto out; + } + ret = calculate_alloc_pointer(cache, &last_alloc); + if (ret || map->num_stripes == num_conventional) { + if (!ret) + cache->alloc_offset = last_alloc; + else + btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", + cache->start); + goto out; + } + } + + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + cache->alloc_offset = alloc_offsets[0]; + break; + case BTRFS_BLOCK_GROUP_DUP: + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID0: + case BTRFS_BLOCK_GROUP_RAID10: + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + /* non-single profiles are not supported yet */ + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + ret = -EINVAL; + goto out; + } + +out: + /* An extent is allocated after the write pointer */ + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { + btrfs_err(fs_info, + "zoned: got wrong write pointer in BG %llu: %llu > %llu", + logical, last_alloc, cache->alloc_offset); + ret = -EIO; + } + + if (!ret) + cache->meta_write_pointer = cache->alloc_offset + cache->start; + + kfree(alloc_offsets); + free_extent_map(em); + + return ret; +} + +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) +{ + u64 unusable, free; + + if (!btrfs_is_zoned(cache->fs_info)) + return; + + WARN_ON(cache->bytes_super != 0); + unusable = cache->alloc_offset - cache->used; + free = cache->length - cache->alloc_offset; + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->free_space_ctl->free_space = free; + cache->zone_unusable = unusable; + + /* Should not have any excluded extents. Just in case, though */ + btrfs_free_excluded_extents(cache); +} + +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (!btrfs_is_zoned(fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || + !list_empty(&eb->release_list)) + return; + + set_extent_buffer_dirty(eb); + set_extent_bits_nowait(&trans->dirty_pages, eb->start, + eb->start + eb->len - 1, EXTENT_DIRTY); + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + + spin_lock(&trans->releasing_ebs_lock); + list_add_tail(&eb->release_list, &trans->releasing_ebs); + spin_unlock(&trans->releasing_ebs_lock); + atomic_inc(&eb->refs); +} + +void btrfs_free_redirty_list(struct btrfs_transaction *trans) +{ + spin_lock(&trans->releasing_ebs_lock); + while (!list_empty(&trans->releasing_ebs)) { + struct extent_buffer *eb; + + eb = list_first_entry(&trans->releasing_ebs, + struct extent_buffer, release_list); + list_del_init(&eb->release_list); + free_extent_buffer(eb); + } + spin_unlock(&trans->releasing_ebs_lock); +} + +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + + if (!fs_info->max_zone_append_size) + return false; + + if (!is_data_inode(&inode->vfs_inode)) + return false; + + cache = btrfs_lookup_block_group(fs_info, em->block_start); + ASSERT(cache); + if (!cache) + return false; + + ret = cache->seq_zone; + btrfs_put_block_group(cache); + + return ret; +} + +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio) +{ + struct btrfs_ordered_extent *ordered; + const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (bio_op(bio) != REQ_OP_ZONE_APPEND) + return; + + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; + ordered->disk = bio->bi_disk; + ordered->partno = bio->bi_partno; + + btrfs_put_ordered_extent(ordered); +} + +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_ordered_sum *sum; + struct block_device *bdev; + u64 orig_logical = ordered->disk_bytenr; + u64 *logical = NULL; + int nr, stripe_len; + + /* Zoned devices should not have partitions. So, we can assume it is 0 */ + ASSERT(ordered->partno == 0); + bdev = bdgrab(ordered->disk->part0); + if (WARN_ON(!bdev)) + return; + + if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev, + ordered->physical, &logical, &nr, + &stripe_len))) + goto out; + + WARN_ON(nr != 1); + + if (orig_logical == *logical) + goto out; + + ordered->disk_bytenr = *logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); + em->block_start = *logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { + if (*logical < orig_logical) + sum->bytenr -= orig_logical - *logical; + else + sum->bytenr += *logical - orig_logical; + } + +out: + kfree(logical); + bdput(bdev); +} + +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + struct btrfs_block_group *cache; + bool ret = true; + + if (!btrfs_is_zoned(fs_info)) + return true; + + cache = *cache_ret; + + if (cache && (eb->start < cache->start || + cache->start + cache->length <= eb->start)) { + btrfs_put_block_group(cache); + cache = NULL; + *cache_ret = NULL; + } + + if (!cache) + cache = btrfs_lookup_block_group(fs_info, eb->start); + + if (cache) { + if (cache->meta_write_pointer != eb->start) { + btrfs_put_block_group(cache); + cache = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; + } + + *cache_ret = cache; + } + + return ret; +} + +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ + if (!btrfs_is_zoned(eb->fs_info) || !cache) + return; + + ASSERT(cache->meta_write_pointer == eb->start + eb->len); + cache->meta_write_pointer = eb->start; +} + +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) +{ + if (!btrfs_dev_is_sequential(device, physical)) + return -EOPNOTSUPP; + + return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, + length >> SECTOR_SHIFT, GFP_NOFS, 0); +} + +static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, + struct blk_zone *zone) +{ + struct btrfs_bio *bbio = NULL; + u64 mapped_length = PAGE_SIZE; + unsigned int nofs_flag; + int nmirrors; + int i, ret; + + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bbio); + if (ret || !bbio || mapped_length < PAGE_SIZE) { + btrfs_put_bbio(bbio); + return -EIO; + } + + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + return -EINVAL; + + nofs_flag = memalloc_nofs_save(); + nmirrors = (int)bbio->num_stripes; + for (i = 0; i < nmirrors; i++) { + u64 physical = bbio->stripes[i].physical; + struct btrfs_device *dev = bbio->stripes[i].dev; + + /* Missing device */ + if (!dev->bdev) + continue; + + ret = btrfs_get_dev_zone(dev, physical, zone); + /* Failing device */ + if (ret == -EIO || ret == -EOPNOTSUPP) + continue; + break; + } + memalloc_nofs_restore(nofs_flag); + + return ret; +} + +/* + * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by + * filling zeros between @physical_pos to a write pointer of dev-replace + * source device. + */ +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos) +{ + struct btrfs_fs_info *fs_info = tgt_dev->fs_info; + struct blk_zone zone; + u64 length; + u64 wp; + int ret; + + if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) + return 0; + + ret = read_zone_info(fs_info, logical, &zone); + if (ret) + return ret; + + wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); + + if (physical_pos == wp) + return 0; + + if (physical_pos > wp) + return -EUCLEAN; + + length = wp - physical_pos; + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8abe2f83272b..61e969652fe1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -7,6 +7,7 @@ #include <linux/blkdev.h> #include "volumes.h" #include "disk-io.h" +#include "block-group.h" struct btrfs_zoned_device_info { /* @@ -25,6 +26,7 @@ struct btrfs_zoned_device_info { #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); @@ -35,6 +37,28 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes); +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes); +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); +void btrfs_free_redirty_list(struct btrfs_transaction *trans); +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio); +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret); +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb); +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -42,6 +66,11 @@ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + return 0; +} + static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) { return 0; @@ -85,6 +114,78 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror return 0; } +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device, + u64 hole_start, u64 hole_end, + u64 num_bytes) +{ + return hole_start; +} + +static inline int btrfs_reset_device_zone(struct btrfs_device *device, + u64 physical, u64 length, u64 *bytes) +{ + *bytes = 0; + return 0; +} + +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, + u64 start, u64 size) +{ + return 0; +} + +static inline int btrfs_load_block_group_zone_info( + struct btrfs_block_group *cache, bool new) +{ + return 0; +} + +static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } + +static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } +static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, + struct extent_map *em) +{ + return false; +} + +static inline void btrfs_record_physical_zoned(struct inode *inode, + u64 file_offset, struct bio *bio) +{ +} + +static inline void btrfs_rewrite_logical_zoned( + struct btrfs_ordered_extent *ordered) { } + +static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + return true; +} + +static inline void btrfs_revert_meta_write_pointer( + struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ +} + +static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, + u64 physical, u64 length) +{ + return -EOPNOTSUPP; +} + +static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + u64 logical, u64 physical_start, + u64 physical_pos) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -136,12 +237,16 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, struct block_device *bdev) { - u64 zone_size; - if (btrfs_is_zoned(fs_info)) { - zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; - /* Do not allow non-zoned device */ - return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + /* + * We can allow a regular device on a zoned filesystem, because + * we will emulate the zoned capabilities. + */ + if (!bdev_is_zoned(bdev)) + return true; + + return fs_info->zone_size == + (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } /* Do not allow Host Manged zoned device */ @@ -157,4 +262,46 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 p return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); } +static inline bool btrfs_can_zone_reset(struct btrfs_device *device, + u64 physical, u64 length) +{ + u64 zone_size; + + if (!btrfs_dev_is_sequential(device, physical)) + return false; + + zone_size = device->zone_info->zone_size; + if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size)) + return false; + + return true; +} + +static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_lock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_unlock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if (!btrfs_is_zoned(fs_info)) + return; + + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg == bg->start) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); +} + #endif diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8bda092e60c5..e027c718ca01 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -413,7 +413,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -713,7 +712,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 840587037b59..d87bd852ed96 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5038,7 +5038,7 @@ bad: return; } -static struct ceph_connection *con_get(struct ceph_connection *con) +static struct ceph_connection *mds_get_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; @@ -5047,7 +5047,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con) return NULL; } -static void con_put(struct ceph_connection *con) +static void mds_put_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; @@ -5058,7 +5058,7 @@ static void con_put(struct ceph_connection *con) * if the client is unresponsive for long enough, the mds will kill * the session entirely. */ -static void peer_reset(struct ceph_connection *con) +static void mds_peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -5067,7 +5067,7 @@ static void peer_reset(struct ceph_connection *con) send_mds_reconnect(mdsc, s); } -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -5125,8 +5125,8 @@ out: * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. */ -static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - int *proto, int force_new) +static struct ceph_auth_handshake * +mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -5142,7 +5142,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, return auth; } -static int add_authorizer_challenge(struct ceph_connection *con, +static int mds_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_mds_session *s = con->private; @@ -5153,7 +5153,7 @@ static int add_authorizer_challenge(struct ceph_connection *con, challenge_buf, challenge_buf_len); } -static int verify_authorizer_reply(struct ceph_connection *con) +static int mds_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -5165,7 +5165,7 @@ static int verify_authorizer_reply(struct ceph_connection *con) NULL, NULL, NULL, NULL); } -static int invalidate_authorizer(struct ceph_connection *con) +static int mds_invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; @@ -5288,15 +5288,15 @@ static int mds_check_message_signature(struct ceph_msg *msg) } static const struct ceph_connection_operations mds_con_ops = { - .get = con_get, - .put = con_put, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .add_authorizer_challenge = add_authorizer_challenge, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .peer_reset = peer_reset, + .get = mds_get_con, + .put = mds_put_con, .alloc_msg = mds_alloc_msg, + .dispatch = mds_dispatch, + .peer_reset = mds_peer_reset, + .get_authorizer = mds_get_authorizer, + .add_authorizer_challenge = mds_add_authorizer_challenge, + .verify_authorizer_reply = mds_verify_authorizer_reply, + .invalidate_authorizer = mds_invalidate_authorizer, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, .get_auth_request = mds_get_auth_request, diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index e4c6ae47a796..6b1ce4efb591 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -133,8 +133,9 @@ cifs_build_devname(char *nodename, const char *prepath) * Caller is responsible for freeing returned value if it is not error. */ char *cifs_compose_mount_options(const char *sb_mountdata, - const char *fullpath, - const struct dfs_info3_param *ref) + const char *fullpath, + const struct dfs_info3_param *ref, + char **devname) { int rc; char *name; @@ -231,7 +232,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strcat(mountdata, "ip="); strcat(mountdata, srvIP); - kfree(name); + if (devname) + *devname = name; + else + kfree(name); /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ @@ -278,7 +282,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - fullpath + 1, NULL); + fullpath + 1, NULL, NULL); if (IS_ERR(mountdata)) { kfree(devname); return (struct vfsmount *)mountdata; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ce0d0037fd0a..ab883e84e116 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -469,7 +469,7 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) static int cifs_show_devname(struct seq_file *m, struct dentry *root) { struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); - char *devname = kstrdup(cifs_sb->ctx->UNC, GFP_KERNEL); + char *devname = kstrdup(cifs_sb->ctx->source, GFP_KERNEL); if (devname == NULL) seq_puts(m, "none"); @@ -822,7 +822,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } - rc = cifs_setup_volume_info(cifs_sb->ctx); + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); if (rc) { root = ERR_PTR(rc); goto out; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 340ff81ee87b..32f7a013402e 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -78,7 +78,8 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, int add_treename); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, - const char *fullpath, const struct dfs_info3_param *ref); + const char *fullpath, const struct dfs_info3_param *ref, + char **devname); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server); @@ -89,6 +90,7 @@ extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx); +extern int smb3_parse_opt(const char *options, const char *key, char **val); extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs); extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, @@ -549,7 +551,7 @@ extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); extern int -cifs_setup_volume_info(struct smb3_fs_context *ctx); +cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname); extern struct TCP_Server_Info * cifs_find_tcp_session(struct smb3_fs_context *ctx); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5d39129406ea..4bb9decbbf27 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2195,7 +2195,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) tcon->nohandlecache = ctx->nohandlecache; else - tcon->nohandlecache = 1; + tcon->nohandlecache = true; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -2628,7 +2628,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } else if (ctx) tcon->unix_ext = 1; /* Unix Extensions supported */ - if (tcon->unix_ext == 0) { + if (!tcon->unix_ext) { cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n"); return; } @@ -2756,6 +2756,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) cifs_sb->prepath = kstrdup(ctx->prepath, GFP_KERNEL); if (cifs_sb->prepath == NULL) return -ENOMEM; + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; } return 0; @@ -2972,17 +2973,28 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ref_path, &referral, NULL); if (!rc) { + char *fake_devname = NULL; + mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - full_path + 1, &referral); + full_path + 1, &referral, + &fake_devname); free_dfs_info_param(&referral); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; } else { - smb3_cleanup_fs_context_contents(ctx); - rc = cifs_setup_volume_info(ctx); + /* + * We can not clear out the whole structure since we + * no longer have an explicit function to parse + * a mount-string. Instead we need to clear out the + * individual fields that are no longer valid. + */ + kfree(ctx->prepath); + ctx->prepath = NULL; + rc = cifs_setup_volume_info(ctx, mdata, fake_devname); } + kfree(fake_devname); kfree(cifs_sb->ctx->mount_options); cifs_sb->ctx->mount_options = mdata; } @@ -3036,6 +3048,7 @@ static int setup_dfs_tgt_conn(const char *path, const char *full_path, struct dfs_info3_param ref = {0}; char *mdata = NULL; struct smb3_fs_context fake_ctx = {NULL}; + char *fake_devname = NULL; cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); @@ -3044,16 +3057,18 @@ static int setup_dfs_tgt_conn(const char *path, const char *full_path, return rc; mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - full_path + 1, &ref); + full_path + 1, &ref, + &fake_devname); free_dfs_info_param(&ref); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; } else - rc = cifs_setup_volume_info(&fake_ctx); + rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname); kfree(mdata); + kfree(fake_devname); if (!rc) { /* @@ -3122,10 +3137,24 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ * we should pass a clone of the original context? */ int -cifs_setup_volume_info(struct smb3_fs_context *ctx) +cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname) { int rc = 0; + smb3_parse_devname(devname, ctx); + + if (mntopts) { + char *ip; + + cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts); + rc = smb3_parse_opt(mntopts, "ip", &ip); + if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, + strlen(ip))) { + cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__); + return -EINVAL; + } + } + if (ctx->nullauth) { cifs_dbg(FYI, "Anonymous login\n"); kfree(ctx->username); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 0fdb0de7ff86..4950ab0486ae 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1417,7 +1417,7 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi, int rc; struct cache_entry *ce; struct dfs_info3_param ref = {0}; - char *mdata = NULL; + char *mdata = NULL, *devname = NULL; struct TCP_Server_Info *server; struct cifs_ses *ses; struct smb3_fs_context ctx = {NULL}; @@ -1444,7 +1444,8 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi, up_read(&htable_rw_lock); - mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref); + mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, + &devname); free_dfs_info_param(&ref); if (IS_ERR(mdata)) { @@ -1453,7 +1454,7 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi, goto out; } - rc = cifs_setup_volume_info(&ctx); + rc = cifs_setup_volume_info(&ctx, NULL, devname); if (rc) { ses = ERR_PTR(rc); @@ -1472,6 +1473,7 @@ out: smb3_cleanup_fs_context_contents(&ctx); kfree(mdata); kfree(rpath); + kfree(devname); return ses; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 68900f1629bf..97ac363b5df1 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -737,6 +737,7 @@ static int cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { struct inode *inode; + int rc; if (flags & LOOKUP_RCU) return -ECHILD; @@ -746,8 +747,25 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) CIFS_I(inode)->time = 0; /* force reval */ - if (cifs_revalidate_dentry(direntry)) - return 0; + rc = cifs_revalidate_dentry(direntry); + if (rc) { + cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc); + switch (rc) { + case -ENOENT: + case -ESTALE: + /* + * Those errors mean the dentry is invalid + * (file was deleted or recreated) + */ + return 0; + default: + /* + * Otherwise some unexpected error happened + * report it as-is to VFS layer + */ + return rc; + } + } else { /* * If the inode wasn't known to be a dfs entry when diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 076bcadc756a..12a5da0230b5 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -148,7 +148,6 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { /* Mount options which take string value */ fsparam_string("source", Opt_source), - fsparam_string("unc", Opt_source), fsparam_string("user", Opt_user), fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), @@ -175,8 +174,15 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag_no("exec", Opt_ignore), fsparam_flag_no("dev", Opt_ignore), fsparam_flag_no("mand", Opt_ignore), + fsparam_flag_no("auto", Opt_ignore), fsparam_string("cred", Opt_ignore), fsparam_string("credentials", Opt_ignore), + /* + * UNC and prefixpath is now extracted from Opt_source + * in the new mount API so we can just ignore them going forward. + */ + fsparam_string("unc", Opt_ignore), + fsparam_string("prefixpath", Opt_ignore), {} }; @@ -311,6 +317,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx new_ctx->password = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; + new_ctx->source = NULL; new_ctx->iocharset = NULL; /* @@ -321,6 +328,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(username); DUP_CTX_STR(password); DUP_CTX_STR(UNC); + DUP_CTX_STR(source); DUP_CTX_STR(domainname); DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); @@ -399,6 +407,37 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) return 0; } +int smb3_parse_opt(const char *options, const char *key, char **val) +{ + int rc = -ENOENT; + char *opts, *orig, *p; + + orig = opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ","))) { + char *nval; + + if (!*p) + continue; + if (strncasecmp(p, key, strlen(key))) + continue; + nval = strchr(p, '='); + if (nval) { + if (nval == p) + continue; + *nval++ = 0; + *val = kstrndup(nval, strlen(nval), GFP_KERNEL); + rc = !*val ? -ENOMEM : 0; + goto out; + } + } +out: + kfree(orig); + return rc; +} + /* * Parse a devname into substrings and populate the ctx->UNC and ctx->prepath * fields with the result. Returns 0 on success and an error otherwise @@ -531,7 +570,7 @@ static int smb3_fs_context_validate(struct fs_context *fc) if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); - return -1; + return -EOPNOTSUPP; } #ifndef CONFIG_KEYS @@ -554,7 +593,7 @@ static int smb3_fs_context_validate(struct fs_context *fc) /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); - return -1; + return -ENOENT; } if (!ctx->got_ip) { @@ -568,7 +607,7 @@ static int smb3_fs_context_validate(struct fs_context *fc) if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr, &ctx->UNC[2], len)) { pr_err("Unable to determine destination address\n"); - return -1; + return -EHOSTUNREACH; } } @@ -699,6 +738,7 @@ static int smb3_reconfigure(struct fs_context *fc) * just use what we already have in cifs_sb->ctx. */ STEAL_STRING(cifs_sb, ctx, UNC); + STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); STEAL_STRING(cifs_sb, ctx, password); STEAL_STRING(cifs_sb, ctx, domainname); @@ -941,6 +981,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_dbg(VFS, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } + ctx->source = kstrdup(param->string, GFP_KERNEL); + if (ctx->source == NULL) { + cifs_dbg(VFS, "OOM when copying UNC string\n"); + goto cifs_parse_mount_err; + } fc->source = kstrdup(param->string, GFP_KERNEL); if (fc->source == NULL) { cifs_dbg(VFS, "OOM when copying UNC string\n"); @@ -1263,7 +1308,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, return 0; cifs_parse_mount_err: - return 1; + return -EINVAL; } int smb3_init_fs_context(struct fs_context *fc) @@ -1363,6 +1408,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->password = NULL; kfree(ctx->UNC); ctx->UNC = NULL; + kfree(ctx->source); + ctx->source = NULL; kfree(ctx->domainname); ctx->domainname = NULL; kfree(ctx->nodename); @@ -1500,8 +1547,8 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_NO_PERM); else - cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MULTIUSER | - CIFS_MOUNT_NO_PERM); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MULTIUSER; + if (ctx->strict_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 3358b33abcd0..1c44a460e2c0 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -159,6 +159,7 @@ struct smb3_fs_context { char *username; char *password; char *domainname; + char *source; char *UNC; char *nodename; char *iocharset; /* local code page for mapping to and from Unicode */ diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index d85edf5d1429..a5a9e33c0d73 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -286,7 +286,7 @@ struct smb2_negotiate_req { __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ __le16 Reserved2; - __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ + __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ } __packed; /* Dialects */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e9abb41aa89b..4a2b836eb017 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -338,7 +338,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (ssocket == NULL) return -EAGAIN; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { cifs_dbg(FYI, "signal pending before send request\n"); return -ERESTARTSYS; } @@ -429,7 +429,7 @@ unmask: if (signal_pending(current) && (total_len != send_length)) { cifs_dbg(FYI, "signal is pending after attempt to send\n"); - rc = -EINTR; + rc = -ERESTARTSYS; } /* uncork it */ @@ -666,10 +666,22 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, if (*credits < num) { /* - * Return immediately if not too many requests in flight since - * we will likely be stuck on waiting for credits. + * If the server is tight on resources or just gives us less + * credits for other reasons (e.g. requests are coming out of + * order and the server delays granting more credits until it + * processes a missing mid) and we exhausted most available + * credits there may be situations when we try to send + * a compound request but we don't have enough credits. At this + * point the client needs to decide if it should wait for + * additional credits or fail the request. If at least one + * request is in flight there is a high probability that the + * server will return enough credits to satisfy this compound + * request. + * + * Return immediately if no requests in flight since we will be + * stuck on waiting for credits. */ - if (server->in_flight < num - *credits) { + if (server->in_flight == 0) { spin_unlock(&server->req_lock); trace_smb3_insufficient_credits(server->CurrentMid, server->hostname, scredits, sin_flight); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..95e72d271b95 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -50,6 +50,7 @@ * which requires asm/elf.h to define compat_elf_gregset_t et al. */ #define elf_prstatus compat_elf_prstatus +#define elf_prstatus_common compat_elf_prstatus_common #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_kernel_old_timeval @@ -61,7 +62,6 @@ * differ from the native ones, or omitted when they match. */ -#undef ELF_ARCH #undef elf_check_arch #define elf_check_arch compat_elf_check_arch @@ -90,11 +90,6 @@ #define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE #endif -#ifdef COMPAT_ELF_EXEC_PAGESIZE -#undef ELF_EXEC_PAGESIZE -#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE -#endif - #ifdef COMPAT_ELF_PLAT_INIT #undef ELF_PLAT_INIT #define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e23752d9a79f..58d0f7187997 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1016,15 +1016,19 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, { int rc; struct dentry *lower_dentry; + struct inode *lower_inode; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) { + lower_inode = d_inode(lower_dentry); + if (!(lower_inode->i_opflags & IOP_XATTR)) { rc = -EOPNOTSUPP; goto out; } - rc = vfs_setxattr(lower_dentry, name, value, size, flags); + inode_lock(lower_inode); + rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, NULL); + inode_unlock(lower_inode); if (!rc && inode) - fsstack_copy_attr_all(inode, d_inode(lower_dentry)); + fsstack_copy_attr_all(inode, lower_inode); out: return rc; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index be10b16ea66e..d5a6b9b888a5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -158,8 +158,8 @@ static int erofs_read_superblock(struct super_block *sb) blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blksize %u isn't supported on this platform", - 1 << blkszbits); + erofs_err(sb, "blkszbits %u isn't supported on this platform", + blkszbits); goto out; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 5bde77d70852..47314a26767a 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -48,8 +48,14 @@ static int init_inode_xattrs(struct inode *inode) int ret = 0; /* the most case is that xattrs of this inode are initialized. */ - if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -137,6 +143,8 @@ static int init_inode_xattrs(struct inode *inode) } xattr_iter_end(&it, atomic_map); + /* paired with smp_mb() at the beginning of the function. */ + smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index ae325541884e..14d2de35110c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -36,8 +36,14 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) void *kaddr; struct z_erofs_map_header *h; - if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -83,6 +89,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + ((h->h_clusterbits >> 5) & 7); + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); unmap_done: kunmap_atomic(kaddr); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 349b27f0dda0..194f5d00fa32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_io || extend); + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d9665d2f82db..713b1ae44c1a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1309,6 +1309,12 @@ out: return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1391,6 +1397,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index d13c5c6a9787..62e638a49bbf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -76,16 +76,6 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS @@ -119,6 +109,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 897edb7c951a..174a0819ad96 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,13 +13,15 @@ #include <linux/f2fs_fs.h> #include <linux/pagevec.h> #include <linux/swap.h> +#include <linux/kthread.h> #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include <trace/events/f2fs.h> +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -443,7 +445,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1018,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) @@ -1707,3 +1707,174 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 4bcbacfe3325..77fa342de38f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), @@ -721,38 +756,27 @@ out: return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -764,20 +788,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -816,18 +840,34 @@ out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1415,7 +1455,7 @@ retry_write: ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1450,6 +1490,9 @@ retry_write: *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { @@ -1494,6 +1537,8 @@ destroy_out: return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1512,12 +1557,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1546,7 +1593,7 @@ out_free: return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { int i; @@ -1574,30 +1621,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa34d620bec9..1ee966a63df9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,7 +25,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -115,10 +114,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -128,25 +138,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; struct bvec_iter_all iter_all; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -157,181 +168,141 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } -} - -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -#endif -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); } -#endif - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + f2fs_finish_read_bio(bio); } -static void f2fs_post_read_work(struct work_struct *work) +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) { - struct bio_post_read_ctx *ctx = - container_of(work, struct bio_post_read_ctx, work); - - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); - - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + struct bio_post_read_ctx *ctx = bio->bi_private; - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); - return; + } else { + f2fs_finish_read_bio(bio); } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); } -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) { - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. - */ + struct bio_vec *bv; + struct bvec_iter_all iter_all; + bool all_compressed = true; - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; } - __f2fs_read_end_io(ctx->bio, false, false); + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } -static bool f2fs_bio_post_read_required(struct bio *bio) +static void f2fs_post_read_work(struct work_struct *work) { - return bio->bi_private; + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); + + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); + + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) @@ -499,7 +470,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; @@ -707,7 +678,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -912,7 +882,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -1009,7 +978,6 @@ alloc_new: wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: @@ -1022,16 +990,9 @@ out: up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1050,13 +1011,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (fscrypt_inode_uses_fs_layer_crypto(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; + + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ - if (post_read_steps) { + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1068,13 +1035,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1083,7 +1043,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1964,6 +1924,7 @@ next: } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; @@ -2121,7 +2082,7 @@ submit_and_realloc: if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2167,8 +2128,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2234,29 +2193,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2272,31 +2212,10 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2308,10 +2227,9 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2328,7 +2246,13 @@ submit_and_realloc: out_put_dnode: f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } @@ -2337,11 +2261,6 @@ out: /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. - * - * Note that the aops->readpages() function is ONLY used for read-ahead. If - * this function ever deviates from doing just read-ahead, it should either - * use ->readpage() or do the necessary surgery to decouple ->readpages() - * from read-ahead. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) @@ -2364,7 +2283,6 @@ static int f2fs_mpage_readpages(struct inode *inode, unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; - bool drop_ra = false; map.m_pblk = 0; map.m_lblk = 0; @@ -2375,26 +2293,10 @@ static int f2fs_mpage_readpages(struct inode *inode, map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - /* - * Two readahead threads for same address range can cause race condition - * which fragments sequential read IOs. So let's avoid each other. - */ - if (rac && readahead_count(rac)) { - if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac)) - drop_ra = true; - else - WRITE_ONCE(F2FS_I(inode)->ra_offset, - readahead_index(rac)); - } - for (; nr_pages; nr_pages--) { if (rac) { page = readahead_page(rac); prefetchw(&page->flags); - if (drop_ra) { - f2fs_put_page(page, 1); - continue; - } } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2457,9 +2359,6 @@ next_page: } if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - - if (rac && readahead_count(rac) && !drop_ra) - WRITE_ONCE(F2FS_I(inode)->ra_offset, -1); return ret; } @@ -2743,7 +2642,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2881,7 +2781,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2928,7 +2828,7 @@ out: #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -3096,7 +2996,8 @@ continue_unlock: } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -3831,7 +3732,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { @@ -4108,12 +4009,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb11759191dc..2860003a09ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -146,6 +147,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -266,6 +268,26 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -717,7 +739,6 @@ struct f2fs_inode_info { struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ - pgoff_t ra_offset; /* ongoing readahead offset */ struct extent_tree *extent_tree; /* cached extent_tree entry */ /* avoid racing between foreground op and gc */ @@ -735,6 +756,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1310,6 +1332,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -1337,7 +1361,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1353,11 +1377,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -1404,6 +1454,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1444,7 +1495,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -1541,9 +1591,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ @@ -3232,6 +3285,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -3418,6 +3472,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c @@ -3469,7 +3527,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3530,6 +3588,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; @@ -3715,8 +3775,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) @@ -3876,7 +3934,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3889,9 +3947,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3915,6 +3972,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -3934,6 +3999,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); @@ -4114,6 +4184,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f585545277d7..471a6ff0c937 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> @@ -60,6 +59,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -70,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -366,7 +372,6 @@ flush_out: f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } @@ -483,6 +488,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -502,7 +510,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -510,11 +517,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); @@ -667,7 +669,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -767,6 +769,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -848,7 +854,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -865,6 +872,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -949,8 +964,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } @@ -2730,7 +2747,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3293,7 +3310,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; @@ -3357,6 +3374,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4043,8 +4068,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); @@ -4272,6 +4299,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); case FS_IOC_GETFSLABEL: return f2fs_ioc_getfslabel(filp, arg); case FS_IOC_SETFSLABEL: @@ -4349,6 +4378,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; @@ -4413,6 +4447,7 @@ write: if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, @@ -4523,6 +4558,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case F2FS_IOC_GET_COMPRESS_BLOCKS: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 806ebabf5870..993caefcd2bb 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -192,6 +192,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6edb1ab579a1..887804968576 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -855,7 +855,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1041,7 +1045,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a24423ac65f..a8a0fb890e8d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include <trace/events/f2fs.h> #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -2696,7 +2694,7 @@ retry: src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index deca74cb17df..440634dfaa56 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -610,8 +607,6 @@ repeat: if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -632,8 +627,6 @@ repeat: fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b4a07fe62d1a..30d5abef4361 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,13 +25,14 @@ #include <linux/quota.h> #include <linux/unicode.h> #include <linux/part_stat.h> +#include <linux/zstd.h> +#include <linux/lz4.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> @@ -143,6 +144,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -213,6 +216,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, @@ -464,6 +469,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -872,6 +945,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -882,17 +961,45 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; - } else if (!strcmp(name, "lz4")) { +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif + } else if (!strncmp(name, "lz4", 3)) { +#ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - } else if (!strcmp(name, "zstd")) { +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif + } else if (!strncmp(name, "zstd", 4)) { +#ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else if (!strcmp(name, "lzo-rle")) { +#ifdef CONFIG_F2FS_FS_LZORLE + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZORLE; +#else + f2fs_info(sbi, "kernel doesn't support lzorle compression"); +#endif } else { kfree(name); return -EINVAL; @@ -1076,8 +1183,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->ra_offset = -1; - return &fi->vfs_inode; } @@ -1246,6 +1351,12 @@ static void f2fs_put_super(struct super_block *sb) mutex_lock(&sbi->umount_mutex); /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do * clean checkpoint again. @@ -1343,16 +1454,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; - - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } - f2fs_trace_ios(NULL, 1); + if (sync) + err = f2fs_issue_checkpoint(sbi); return err; } @@ -1369,6 +1472,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } @@ -1539,6 +1646,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); @@ -1674,6 +1784,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1796,6 +1910,9 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + /* we should flush all the data to keep data consistency */ + sync_inodes_sb(sbi->sb); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -1954,6 +2071,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2638,10 +2768,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2650,6 +2780,11 @@ static loff_t max_file_blocks(void) * result as zero. */ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3533,8 +3668,7 @@ try_onemore: if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; @@ -3701,6 +3835,19 @@ try_onemore: f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3786,12 +3933,10 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } @@ -3910,6 +4055,8 @@ free_nm: free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -4024,8 +4171,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4118,7 +4263,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 30bae57428d1..e38a7f6921dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include <linux/f2fs_fs.h> #include <linux/seq_file.h> #include <linux/unicode.h> +#include <linux/ioprio.h> #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -96,6 +100,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -255,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -308,6 +335,38 @@ out: return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -567,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -652,6 +712,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), @@ -702,6 +763,13 @@ static struct attribute *f2fs_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_feat); +F2FS_GENERAL_RO_ATTR(sb_status); +static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_stat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -730,6 +798,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_groups = f2fs_stat_groups, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -936,11 +1042,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -956,6 +1066,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -967,6 +1084,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - */ -#include <linux/fs.h> -#include <linux/f2fs_fs.h> -#include <linux/sched.h> -#include <linux/radix-tree.h> - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include <trace/events/f2fs.h> - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 65afcc3cc68a..8159fae74b9a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; diff --git a/fs/fcntl.c b/fs/fcntl.c index 05b36b28f2e8..483ef8861376 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -148,11 +148,15 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { - pid_t pid; + pid_t pid = 0; read_lock(&filp->f_owner.lock); - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) - pid = -pid; + rcu_read_lock(); + if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { + pid = pid_vnr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + } + rcu_read_unlock(); read_unlock(&filp->f_owner.lock); return pid; } @@ -200,11 +204,14 @@ static int f_setown_ex(struct file *filp, unsigned long arg) static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; - struct f_owner_ex owner; + struct f_owner_ex owner = {}; int ret = 0; read_lock(&filp->f_owner.lock); - owner.pid = pid_vnr(filp->f_owner.pid); + rcu_read_lock(); + if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) + owner.pid = pid_vnr(filp->f_owner.pid); + rcu_read_unlock(); switch (filp->f_owner.pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index acfb55834af2..c41cb887eb7d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1474,21 +1474,25 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Some filesystems may redirty the inode during the writeback - * due to delalloc, clear dirty metadata flags right before - * write_inode() + * If the inode has dirty timestamps and we need to write them, call + * mark_inode_dirty_sync() to notify the filesystem about it and to + * change I_DIRTY_TIME into I_DIRTY_SYNC. */ - spin_lock(&inode->i_lock); - - dirty = inode->i_state & I_DIRTY; if ((inode->i_state & I_DIRTY_TIME) && - ((dirty & I_DIRTY_INODE) || - wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { - dirty |= I_DIRTY_TIME; trace_writeback_lazytime(inode); + mark_inode_dirty_sync(inode); } + + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode->i_lock); + dirty = inode->i_state & I_DIRTY; inode->i_state &= ~dirty; /* @@ -1509,8 +1513,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_unlock(&inode->i_lock); - if (dirty & I_DIRTY_TIME) - mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b39b339feddc..89609c299717 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, if (ret) goto out_uninit; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - is_sync_kiocb(iocb)); - + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); @@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, if (offset + len > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); if (ret == -ENOTBLK) ret = 0; out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b5c109703daa..21c20fd5f9ee 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -735,9 +735,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); + set_page_huge_active(page); /* * unlock_page because locked by add_to_page_cache() - * page_put due to reference from alloc_huge_page() + * put_page() due to reference from alloc_huge_page() */ unlock_page(page); put_page(page); diff --git a/fs/internal.h b/fs/internal.h index 77c50befbfbe..cff1f30cfefb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -15,6 +15,7 @@ struct mount; struct shrink_control; struct fs_context; struct user_namespace; +struct pipe_inode_info; /* * block_dev.c @@ -193,3 +194,11 @@ int sb_init_dio_done_wq(struct super_block *sb); */ int do_statx(int dfd, const char __user *filename, unsigned flags, unsigned int mask, struct statx __user *buffer); + +/* + * fs/splice.c: + */ +long splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags); diff --git a/fs/io_uring.c b/fs/io_uring.c index 985a9e3f976d..931671082e61 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -857,7 +857,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -866,7 +867,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, @@ -1025,6 +1027,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force); +static void io_req_drop_files(struct io_kiocb *req); +static void io_req_task_queue(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -1048,8 +1052,7 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | - REQ_F_INFLIGHT)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1069,14 +1072,21 @@ static bool io_match_task(struct io_kiocb *head, { struct io_kiocb *req; - if (task && head->task != task) + if (task && head->task != task) { + /* in terms of cancelation, always match if req task is dead */ + if (head->task->flags & PF_EXITING) + return true; return false; + } if (!files) return true; io_for_each_link(req, head) { - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES) && + if (!(req->flags & REQ_F_WORK_INITIALIZED)) + continue; + if (req->file && req->file->f_op == &io_uring_fops) + return true; + if ((req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files == files) return true; } @@ -1394,6 +1404,8 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.flags &= ~IO_WQ_WORK_FS; } + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); io_put_identity(req->task->io_uring, req); } @@ -1503,11 +1515,14 @@ static bool io_grab_identity(struct io_kiocb *req) return false; atomic_inc(&id->files->count); get_nsproxy(id->nsproxy); - req->flags |= REQ_F_INFLIGHT; - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); + if (!(req->flags & REQ_F_INFLIGHT)) { + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } req->work.flags |= IO_WQ_WORK_FILES; } if (!(req->work.flags & IO_WQ_WORK_MM) && @@ -1622,18 +1637,11 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) do { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - struct io_kiocb *link; if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); - /* punt-init is done before queueing for defer */ - link = __io_queue_async_work(de->req); - if (link) { - __io_queue_linked_timeout(link); - /* drop submission reference */ - io_put_req_deferred(link, 1); - } + io_req_task_queue(de->req); kfree(de); } while (!list_empty(&ctx->defer_list)); } @@ -1767,12 +1775,13 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; unsigned long flags; - bool all_flushed; + bool all_flushed, posted; LIST_HEAD(list); if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) return false; + posted = false; spin_lock_irqsave(&ctx->completion_lock, flags); list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) @@ -1792,6 +1801,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow); } + posted = true; } all_flushed = list_empty(&ctx->cq_overflow_list); @@ -1801,9 +1811,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - io_commit_cqring(ctx); + if (posted) + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); + if (posted) + io_cqring_ev_posted(ctx); while (!list_empty(&list)) { req = list_first_entry(&list, struct io_kiocb, compl.list); @@ -2195,6 +2207,9 @@ static void __io_req_task_submit(struct io_kiocb *req) else __io_req_task_cancel(req, -EFAULT); mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_SQPOLL) + io_sq_thread_drop_mm_files(); } static void io_req_task_submit(struct callback_head *cb) @@ -2270,6 +2285,8 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct io_uring_task *tctx = rb->task->io_uring; percpu_counter_sub(&tctx->inflight, rb->task_refs); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -2288,6 +2305,8 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) struct io_uring_task *tctx = rb->task->io_uring; percpu_counter_sub(&tctx->inflight, rb->task_refs); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); put_task_struct_many(rb->task, rb->task_refs); } rb->task = req->task; @@ -3548,7 +3567,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* read it all, or we did blocking attempt. no retry. */ if (!iov_iter_count(iter) || !force_nonblock || - (req->file->f_flags & O_NONBLOCK)) + (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG)) goto done; io_size -= ret; @@ -4468,7 +4487,6 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * io_wq_work.flags, so initialize io_wq_work firstly. */ io_req_init_async(req); - req->work.flags |= IO_WQ_WORK_NO_CANCEL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4501,6 +4519,8 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* not safe to cancel at this point */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; /* was never set, but play safe */ req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ @@ -6157,8 +6177,10 @@ static void io_req_drop_files(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); + if (req->work.flags & IO_WQ_WORK_FILES) { + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + } spin_lock_irqsave(&ctx->inflight_lock, flags); list_del(&req->inflight_entry); spin_unlock_irqrestore(&ctx->inflight_lock, flags); @@ -6225,9 +6247,6 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } - - if (req->flags & REQ_F_INFLIGHT) - io_req_drop_files(req); } static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, @@ -6446,6 +6465,16 @@ static struct file *io_file_get(struct io_submit_state *state, file = __io_file_get(state, fd); } + if (file && file->f_op == &io_uring_fops && + !(req->flags & REQ_F_INFLIGHT)) { + io_req_init_async(req); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } + return file; } @@ -7245,14 +7274,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(); - if (ret > 0) + if (ret > 0) { + finish_wait(&ctx->wait, &iowq.wq); continue; + } else if (ret < 0) break; if (io_should_wake(&iowq)) break; - if (test_bit(0, &ctx->cq_check_overflow)) + if (test_bit(0, &ctx->cq_check_overflow)) { + finish_wait(&ctx->wait, &iowq.wq); continue; + } if (uts) { timeout = schedule_timeout(timeout); if (timeout == 0) { @@ -8844,39 +8877,44 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } +static int io_uring_count_inflight(struct io_ring_ctx *ctx, + struct task_struct *task, + struct files_struct *files) +{ + struct io_kiocb *req; + int cnt = 0; + + spin_lock_irq(&ctx->inflight_lock); + list_for_each_entry(req, &ctx->inflight_list, inflight_entry) + cnt += io_match_task(req, task, files); + spin_unlock_irq(&ctx->inflight_lock); + return cnt; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { struct io_task_cancel cancel = { .task = task, .files = files }; - struct io_kiocb *req; DEFINE_WAIT(wait); - bool found = false; - - spin_lock_irq(&ctx->inflight_lock); - list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->task != task || - req->work.identity->files != files) - continue; - found = true; - break; - } - if (found) - prepare_to_wait(&task->io_uring->wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&ctx->inflight_lock); + int inflight; - /* We need to keep going until we don't find a matching req */ - if (!found) + inflight = io_uring_count_inflight(ctx, task, files); + if (!inflight) break; io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); io_poll_remove_all(ctx, task, files); io_kill_timeouts(ctx, task, files); + io_cqring_overflow_flush(ctx, true, task, files); /* cancellations _may_ trigger task work */ io_run_task_work(); - schedule(); + + prepare_to_wait(&task->io_uring->wait, &wait, + TASK_UNINTERRUPTIBLE); + if (inflight == io_uring_count_inflight(ctx, task, files)) + schedule(); finish_wait(&task->io_uring->wait, &wait); } } @@ -8914,8 +8952,6 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static void io_disable_sqo_submit(struct io_ring_ctx *ctx) { - WARN_ON_ONCE(ctx->sqo_task != current); - mutex_lock(&ctx->uring_lock); ctx->sqo_dead = 1; mutex_unlock(&ctx->uring_lock); @@ -8936,7 +8972,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task = current; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - /* for SQPOLL only sqo_task has task notes */ io_disable_sqo_submit(ctx); task = ctx->sq_data->thread; atomic_inc(&task->io_uring->in_idle); @@ -8946,19 +8981,12 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); + io_uring_cancel_files(ctx, task, files); if (!files) __io_uring_cancel_task_requests(ctx, task); - else - io_uring_cancel_files(ctx, task, files); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); - /* - * If the files that are going away are the ones in the thread - * identity, clear them out. - */ - if (task->io_uring->identity->files == files) - task->io_uring->identity->files = NULL; io_sq_thread_unpark(ctx->sq_data); } } @@ -9082,6 +9110,10 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); + /* trigger io_disable_sqo_submit() */ + if (tctx->sqpoll) + __io_uring_files_cancel(NULL); + do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); @@ -9092,16 +9124,15 @@ void __io_uring_task_cancel(void) prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* - * If we've seen completions, retry. This avoids a race where - * a completion comes in before we did prepare_to_wait(). + * If we've seen completions, retry without waiting. This + * avoids a race where a completion comes in before we did + * prepare_to_wait(). */ - if (inflight != tctx_inflight(tctx)) - continue; - schedule(); + if (inflight == tctx_inflight(tctx)) + schedule(); finish_wait(&tctx->wait, &wait); } while (1); - finish_wait(&tctx->wait, &wait); atomic_dec(&tctx->in_idle); io_uring_remove_task_files(tctx); @@ -9112,6 +9143,9 @@ static int io_uring_flush(struct file *file, void *data) struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx = file->private_data; + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_uring_cancel_task_requests(ctx, NULL); + if (!tctx) return 0; @@ -9128,7 +9162,10 @@ static int io_uring_flush(struct file *file, void *data) if (ctx->flags & IORING_SETUP_SQPOLL) { /* there is only one file note, which is owned by sqo_task */ - WARN_ON_ONCE((ctx->sqo_task == current) == + WARN_ON_ONCE(ctx->sqo_task != current && + xa_load(&tctx->xa, (unsigned long)file)); + /* sqo_dead check is for when this happens after cancellation */ + WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead && !xa_load(&tctx->xa, (unsigned long)file)); io_disable_sqo_submit(ctx); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5bec..e4c258a2cefc 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -201,6 +201,34 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, iomap_dio_submit_bio(dio, iomap, bio, pos); } +/* + * Figure out the bio's operation flags from the dio request, the + * mapping, and whether or not we want FUA. Note that we can end up + * clearing the WRITE_FUA flag in the dio request. + */ +static inline unsigned int +iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +{ + unsigned int opflags = REQ_SYNC | REQ_IDLE; + + if (!(dio->flags & IOMAP_DIO_WRITE)) { + WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + return REQ_OP_READ; + } + + if (iomap->flags & IOMAP_F_ZONE_APPEND) + opflags |= REQ_OP_ZONE_APPEND; + else + opflags |= REQ_OP_WRITE; + + if (use_fua) + opflags |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; + + return opflags; +} + static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) @@ -208,6 +236,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; @@ -263,6 +292,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, iomap_dio_zero(dio, iomap, pos - pad, pad); } + /* + * Set the operation flags early so that bio_iov_iter_get_pages + * can set up the page vector appropriately for a ZONE_APPEND + * operation. + */ + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + do { size_t n; if (dio->error) { @@ -278,6 +314,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { @@ -293,14 +330,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - if (use_fua) - bio->bi_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } @@ -420,23 +451,22 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion) + unsigned int dio_flags) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; - unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = + is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); + unsigned int iomap_flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; if (!count) return NULL; - if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion)) - return ERR_PTR(-EIO); - dio = kmalloc(sizeof(*dio), GFP_KERNEL); if (!dio) return ERR_PTR(-ENOMEM); @@ -461,7 +491,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { - flags |= IOMAP_WRITE; + iomap_flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; /* for data sync or sync, we need sync completion processing */ @@ -483,7 +513,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = -EAGAIN; goto out_free_dio; } - flags |= IOMAP_NOWAIT; + iomap_flags |= IOMAP_NOWAIT; + } + + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { + ret = -EAGAIN; + if (pos >= dio->i_size || pos + count > dio->i_size) + goto out_free_dio; + iomap_flags |= IOMAP_OVERWRITE_ONLY; } ret = filemap_write_and_wait_range(mapping, pos, end); @@ -514,7 +551,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); do { - ret = iomap_apply(inode, pos, count, flags, ops, dio, + ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio, iomap_dio_actor); if (ret <= 0) { /* magic error code to fall back to buffered I/O */ @@ -598,11 +635,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - bool wait_for_completion) + unsigned int dio_flags) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 94b7c1cb5ceb..7aee15608619 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1656,7 +1656,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) } else if (rc == -ENOSPC) { /* search for next smaller log2 block */ l2nb = BLKSTOL2(nblocks) - 1; - nblocks = 1 << l2nb; + nblocks = 1LL << l2nb; } else { /* Trim any already allocated blocks */ jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n"); diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 1e899298f7f0..b5d702df7111 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -268,5 +268,6 @@ * fsck() must be run to repair */ #define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ +#define FM_STATE_MAX 0x0000000f /* max value of s_state */ #endif /* _H_JFS_FILSYS */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 2935d4c776ec..5d7d7170c03c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -37,6 +37,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <linux/log2.h> #include "jfs_incore.h" #include "jfs_filsys.h" @@ -366,6 +367,15 @@ static int chkSuper(struct super_block *sb) sbi->bsize = bsize; sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + /* check some fields for possible corruption */ + if (sbi->l2bsize != ilog2((u32)bsize) || + j_sb->pad != 0 || + le32_to_cpu(j_sb->s_state) > FM_STATE_MAX) { + rc = -EINVAL; + jfs_err("jfs_mount: Mount Failure: superblock is corrupt!"); + goto out; + } + /* * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer * cache. diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index dca8edd2378c..053295cd7bc6 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -148,10 +148,10 @@ static struct { /* * forward references */ -static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd); -static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck); +static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck, struct commit *cd); +static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck); static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, @@ -159,8 +159,8 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk); static void txForce(struct tblock * tblk); -static int txLog(struct jfs_log * log, struct tblock * tblk, - struct commit * cd); +static void txLog(struct jfs_log *log, struct tblock *tblk, + struct commit *cd); static void txUpdateMap(struct tblock * tblk); static void txRelease(struct tblock * tblk); static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, @@ -1256,8 +1256,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * * txUpdateMap() resets XAD_NEW in XAD. */ - if ((rc = txLog(log, tblk, &cd))) - goto TheEnd; + txLog(log, tblk, &cd); /* * Ensure that inode isn't reused before @@ -1365,9 +1364,8 @@ int txCommit(tid_t tid, /* transaction identifier */ * * RETURN : */ -static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd) { - int rc = 0; struct inode *ip; lid_t lid; struct tlock *tlck; @@ -1414,7 +1412,7 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) } } - return rc; + return; } /* @@ -1422,10 +1420,9 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) * * function: log inode tlock and format maplock to update bmap; */ -static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd) +static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck, struct commit *cd) { - int rc = 0; struct metapage *mp; pxd_t *pxd; struct pxd_lock *pxdlock; @@ -1527,7 +1524,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } #endif /* _JFS_WIP */ - return rc; + return; } /* @@ -1535,8 +1532,8 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * * function: log data tlock */ -static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck) +static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck) { struct metapage *mp; pxd_t *pxd; @@ -1562,7 +1559,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; - return 0; + return; } PXDaddress(pxd, mp->index); @@ -1573,7 +1570,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; - return 0; + return; } /* diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f277d023ebcd..c75719312147 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> +#include <linux/uio.h> #include "kernfs-internal.h" @@ -180,11 +181,10 @@ static const struct seq_operations kernfs_seq_ops = { * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ -static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, - char __user *user_buf, size_t count, - loff_t *ppos) +static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t len = min_t(size_t, count, PAGE_SIZE); + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; @@ -210,7 +210,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) - len = ops->read(of, buf, len, *ppos); + len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; @@ -220,12 +220,12 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, if (len < 0) goto out_free; - if (copy_to_user(user_buf, buf, len)) { + if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } - *ppos += len; + iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) @@ -235,31 +235,14 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, return len; } -/** - * kernfs_fop_read - kernfs vfs read callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - */ -static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct kernfs_open_file *of = kernfs_of(file); - - if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) - return seq_read(file, user_buf, count, ppos); - else - return kernfs_file_direct_read(of, user_buf, count, ppos); + if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read_iter(iocb, iter); + return kernfs_file_read_iter(iocb, iter); } -/** - * kernfs_fop_write - kernfs vfs write callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * +/* * Copy data in from userland and pass it to the matching kernfs write * operation. * @@ -269,20 +252,18 @@ static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, * modify only the the value you're changing, then write entire buffer * back. */ -static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct kernfs_open_file *of = kernfs_of(file); + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; - ssize_t len; char *buf; if (of->atomic_write_len) { - len = count; if (len > of->atomic_write_len) return -E2BIG; } else { - len = min_t(size_t, count, PAGE_SIZE); + len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; @@ -293,7 +274,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, if (!buf) return -ENOMEM; - if (copy_from_user(buf, user_buf, len)) { + if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } @@ -312,7 +293,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, ops = kernfs_ops(of->kn); if (ops->write) - len = ops->write(of, buf, len, *ppos); + len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; @@ -320,7 +301,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, mutex_unlock(&of->mutex); if (len > 0) - *ppos += len; + iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) @@ -673,7 +654,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) /* * Write path needs to atomic_write_len outside active reference. - * Cache it in open_file. See kernfs_fop_write() for details. + * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; @@ -960,14 +941,16 @@ void kernfs_notify(struct kernfs_node *kn) EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { - .read = kernfs_fop_read, - .write = kernfs_fop_write, + .read_iter = kernfs_fop_read_iter, + .write_iter = kernfs_fop_write_iter, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; /** diff --git a/fs/libfs.c b/fs/libfs.c index d1c3bade9f30..79721571e014 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1388,8 +1388,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1426,7 +1426,6 @@ fallback: return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1435,7 +1434,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1450,7 +1449,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index fa41dda39925..4c10fb5138f1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -512,6 +512,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlm4svc_proc_test, @@ -520,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlm4svc_proc_lock, @@ -528,6 +530,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlm4svc_proc_cancel, @@ -536,6 +539,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlm4svc_proc_unlock, @@ -544,6 +548,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlm4svc_proc_granted, @@ -552,6 +557,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlm4svc_proc_test_msg, @@ -560,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlm4svc_proc_lock_msg, @@ -568,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlm4svc_proc_cancel_msg, @@ -576,6 +584,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlm4svc_proc_unlock_msg, @@ -584,6 +593,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlm4svc_proc_granted_msg, @@ -592,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlm4svc_proc_null, @@ -600,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -608,6 +620,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlm4svc_proc_null, @@ -616,6 +629,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlm4svc_proc_null, @@ -624,6 +638,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlm4svc_proc_granted_res, @@ -632,6 +647,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlm4svc_proc_sm_notify, @@ -640,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlm4svc_proc_unused, @@ -648,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlm4svc_proc_unused, @@ -656,6 +674,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlm4svc_proc_unused, @@ -664,6 +683,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlm4svc_proc_share, @@ -672,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlm4svc_proc_unshare, @@ -680,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlm4svc_proc_nm_lock, @@ -688,6 +710,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlm4svc_proc_free_all, @@ -696,5 +719,6 @@ const struct svc_procedure nlmsvc_procedures4[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 50855f2c1f4b..4ae4b63b5392 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -554,6 +554,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "NULL", }, [NLMPROC_TEST] = { .pc_func = nlmsvc_proc_test, @@ -562,6 +563,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+2+No+Rg, + .pc_name = "TEST", }, [NLMPROC_LOCK] = { .pc_func = nlmsvc_proc_lock, @@ -570,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "LOCK", }, [NLMPROC_CANCEL] = { .pc_func = nlmsvc_proc_cancel, @@ -578,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "CANCEL", }, [NLMPROC_UNLOCK] = { .pc_func = nlmsvc_proc_unlock, @@ -586,6 +590,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "UNLOCK", }, [NLMPROC_GRANTED] = { .pc_func = nlmsvc_proc_granted, @@ -594,6 +599,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "GRANTED", }, [NLMPROC_TEST_MSG] = { .pc_func = nlmsvc_proc_test_msg, @@ -602,6 +608,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_MSG", }, [NLMPROC_LOCK_MSG] = { .pc_func = nlmsvc_proc_lock_msg, @@ -610,6 +617,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_MSG", }, [NLMPROC_CANCEL_MSG] = { .pc_func = nlmsvc_proc_cancel_msg, @@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_MSG", }, [NLMPROC_UNLOCK_MSG] = { .pc_func = nlmsvc_proc_unlock_msg, @@ -626,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_MSG", }, [NLMPROC_GRANTED_MSG] = { .pc_func = nlmsvc_proc_granted_msg, @@ -634,6 +644,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_MSG", }, [NLMPROC_TEST_RES] = { .pc_func = nlmsvc_proc_null, @@ -642,6 +653,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "TEST_RES", }, [NLMPROC_LOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -650,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "LOCK_RES", }, [NLMPROC_CANCEL_RES] = { .pc_func = nlmsvc_proc_null, @@ -658,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "CANCEL_RES", }, [NLMPROC_UNLOCK_RES] = { .pc_func = nlmsvc_proc_null, @@ -666,6 +680,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNLOCK_RES", }, [NLMPROC_GRANTED_RES] = { .pc_func = nlmsvc_proc_granted_res, @@ -674,6 +689,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_res), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "GRANTED_RES", }, [NLMPROC_NSM_NOTIFY] = { .pc_func = nlmsvc_proc_sm_notify, @@ -682,6 +698,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_reboot), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "SM_NOTIFY", }, [17] = { .pc_func = nlmsvc_proc_unused, @@ -690,6 +707,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [18] = { .pc_func = nlmsvc_proc_unused, @@ -698,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [19] = { .pc_func = nlmsvc_proc_unused, @@ -706,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_void), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = St, + .pc_name = "UNUSED", }, [NLMPROC_SHARE] = { .pc_func = nlmsvc_proc_share, @@ -714,6 +734,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "SHARE", }, [NLMPROC_UNSHARE] = { .pc_func = nlmsvc_proc_unshare, @@ -722,6 +743,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St+1, + .pc_name = "UNSHARE", }, [NLMPROC_NM_LOCK] = { .pc_func = nlmsvc_proc_nm_lock, @@ -730,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_res), .pc_xdrressize = Ck+St, + .pc_name = "NM_LOCK", }, [NLMPROC_FREE_ALL] = { .pc_func = nlmsvc_proc_free_all, @@ -738,5 +761,6 @@ const struct svc_procedure nlmsvc_procedures[24] = { .pc_argsize = sizeof(struct nlm_args), .pc_ressize = sizeof(struct nlm_void), .pc_xdrressize = 0, + .pc_name = "FREE_ALL", }, }; diff --git a/fs/namei.c b/fs/namei.c index 78443a85480a..de74ad2bc6e2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -630,6 +630,11 @@ static inline bool legitimize_path(struct nameidata *nd, static bool legitimize_links(struct nameidata *nd) { int i; + if (unlikely(nd->flags & LOOKUP_CACHED)) { + drop_links(nd); + nd->depth = 0; + return false; + } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { @@ -669,17 +674,17 @@ static bool legitimize_root(struct nameidata *nd) */ /** - * unlazy_walk - try to switch to ref-walk mode. + * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * Returns: 0 on success, -ECHILD on failure + * Returns: true on success, false on failure * - * unlazy_walk attempts to legitimize the current nd->path and nd->root + * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. - * Nothing should touch nameidata between unlazy_walk() failure and + * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ -static int unlazy_walk(struct nameidata *nd) +static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; @@ -694,30 +699,30 @@ static int unlazy_walk(struct nameidata *nd) goto out; rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); - return 0; + return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: rcu_read_unlock(); - return -ECHILD; + return false; } /** - * unlazy_child - try to switch to ref-walk mode. + * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry - * @seq: seq number to check dentry against - * Returns: 0 on success, -ECHILD on failure + * @dentry: next dentry to step into + * @seq: seq number to check @dentry against + * Returns: true on success, false on failure * - * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd. Must be called from rcu-walk context. - * Nothing should touch nameidata between unlazy_child() failure and + * Similar to to try_to_unlazy(), but here we have the next dentry already + * picked by rcu-walk and want to legitimize that in addition to the current + * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. + * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ -static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) { BUG_ON(!(nd->flags & LOOKUP_RCU)); @@ -747,7 +752,7 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se if (unlikely(!legitimize_root(nd))) goto out_dput; rcu_read_unlock(); - return 0; + return true; out2: nd->path.mnt = NULL; @@ -755,11 +760,11 @@ out1: nd->path.dentry = NULL; out: rcu_read_unlock(); - return -ECHILD; + return false; out_dput: rcu_read_unlock(); dput(dentry); - return -ECHILD; + return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) @@ -792,7 +797,8 @@ static int complete_walk(struct nameidata *nd) */ if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; - if (unlikely(unlazy_walk(nd))) + nd->flags &= ~LOOKUP_CACHED; + if (!try_to_unlazy(nd)) return -ECHILD; } @@ -1372,7 +1378,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ENOENT; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 0; - if (unlazy_child(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry, seq)) return -ECHILD; // *path might've been clobbered by __follow_mount_rcu() path->mnt = nd->path.mnt; @@ -1466,7 +1472,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, unsigned seq; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { - if (unlazy_walk(nd)) + if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } @@ -1493,9 +1499,9 @@ static struct dentry *lookup_fast(struct nameidata *nd, status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (unlazy_child(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry, seq)) return ERR_PTR(-ECHILD); - if (unlikely(status == -ECHILD)) + if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { @@ -1567,10 +1573,8 @@ static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD) + if (err != -ECHILD || !try_to_unlazy(nd)) return err; - if (unlazy_walk(nd)) - return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); } @@ -1592,7 +1596,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, seq); - if (unlazy_walk(nd) != 0 || !grabbed_link) + if (!try_to_unlazy(nd) != 0 || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) @@ -1634,7 +1638,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { - if (unlikely(unlazy_walk(nd))) + if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } @@ -1651,11 +1655,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); - if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); + if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); - } } else { res = get(link->dentry, inode, &last->done); } @@ -2195,7 +2196,7 @@ OK: } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd)) + if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; @@ -2209,6 +2210,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) int error; const char *s = nd->name->name; + /* LOOKUP_CACHED requires RCU, ask caller to retry */ + if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + return ERR_PTR(-EAGAIN); + if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) @@ -3129,7 +3134,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct inode *inode; struct dentry *dentry; const char *res; - int error; nd->flags |= op->intent; @@ -3153,9 +3157,8 @@ static const char *open_last_lookups(struct nameidata *nd, } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { - error = unlazy_walk(nd); - if (unlikely(error)) - return ERR_PTR(error); + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ @@ -3164,9 +3167,7 @@ static const char *open_last_lookups(struct nameidata *nd, } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { - error = mnt_want_write(nd->path.mnt); - if (!error) - got_write = true; + got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be @@ -3325,10 +3326,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ error = may_open(&path, 0, op->open_flag); - if (error) - goto out2; - file->f_path.mnt = path.mnt; - error = finish_open(file, child, NULL); + if (!error) + error = vfs_open(&path, file); out2: mnt_drop_write(path.mnt); out: diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 79ff172eb1c8..c5348ba81129 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -1060,6 +1060,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, + .pc_name = "NULL", }, [CB_COMPOUND] = { .pc_func = nfs4_callback_compound, @@ -1067,6 +1068,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { .pc_argsize = 256, .pc_ressize = 256, .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, + .pc_name = "COMPOUND", } }; diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 7412bb164fa7..f2b34cfe286c 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -167,10 +167,28 @@ out: return parent; } +static u64 nfs_fetch_iversion(struct inode *inode) +{ + struct nfs_server *server = NFS_SERVER(inode); + + /* Is this the right call?: */ + nfs_revalidate_inode(server, inode); + /* + * Also, note we're ignoring any returned error. That seems to be + * the practice for cache consistency information elsewhere in + * the server, but I'm not sure why. + */ + if (server->nfs_client->rpc_ops->version >= 4) + return inode_peek_iversion_raw(inode); + else + return time_to_chattr(&inode->i_ctime); +} + const struct export_operations nfs_export_ops = { .encode_fh = nfs_encode_fh, .fh_to_dentry = nfs_fh_to_dentry, .get_parent = nfs_get_parent, + .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| EXPORT_OP_NOATOMIC_ATTR, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 57b3821d975a..441a2fa073c8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -420,7 +420,9 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { */ void nfs42_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); +#endif } /** @@ -431,7 +433,9 @@ void nfs42_ssc_register_ops(void) */ void nfs42_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); +#endif } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f274f21c4ab..af64b4e6fd1f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -324,6 +324,21 @@ pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) return NULL; } +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) +{ + if (pnfs_seqid_is_newer(newseq, lo->plh_barrier)) + lo->plh_barrier = newseq; +} + static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) @@ -335,6 +350,7 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, if (seq != 0) { WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); lo->plh_return_seq = seq; + pnfs_barrier_update(lo, seq); } } @@ -639,15 +655,6 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - static bool pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, const struct pnfs_layout_range *recall_range) @@ -984,8 +991,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, new_barrier = be32_to_cpu(new->seqid); else if (new_barrier == 0) return; - if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + pnfs_barrier_update(lo, new_barrier); } static bool @@ -994,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, { u32 seqid = be32_to_cpu(stateid->seqid); - return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); + return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier; } /* lget is set to 1 if called from inside send_layoutget call chain */ @@ -1183,20 +1189,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, return false; set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { - nfs4_stateid_copy(stateid, &lo->plh_stateid); - *cred = get_cred(lo->plh_lc_cred); if (lo->plh_return_seq != 0) stateid->seqid = cpu_to_be32(lo->plh_return_seq); if (iomode != NULL) *iomode = lo->plh_return_iomode; pnfs_clear_layoutreturn_info(lo); - return true; - } - nfs4_stateid_copy(stateid, &lo->plh_stateid); - *cred = get_cred(lo->plh_lc_cred); - if (iomode != NULL) + } else if (iomode != NULL) *iomode = IOMODE_ANY; + pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid)); return true; } @@ -1909,6 +1912,11 @@ static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) wake_up_var(&lo->plh_outstanding); } +static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags); +} + static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) { unsigned long *bitlock = &lo->plh_flags; @@ -2383,23 +2391,34 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (!pnfs_layout_is_valid(lo)) { - /* We have a completely new layout */ - pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); - } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (!pnfs_layout_is_valid(lo) && + pnfs_is_first_layoutget(lo)) + lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; } pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); - } else { + } else if (pnfs_layout_is_valid(lo)) { /* * We got an entirely new state ID. Mark all segments for the * inode invalid, and retry the layoutget */ - pnfs_mark_layout_stateid_invalid(lo, &free_me); + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .length = NFS4_MAX_UINT64, + }; + pnfs_set_plh_return_info(lo, IOMODE_ANY, 0); + pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + &range, 0); goto out_forget; + } else { + /* We have a completely new layout */ + if (!pnfs_is_first_layoutget(lo)) + goto out_forget; + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } pnfs_get_lseg(lseg); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 4034102010f0..c7a924580eec 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -86,9 +86,11 @@ const struct super_operations nfs_sops = { }; EXPORT_SYMBOL_GPL(nfs_sops); +#ifdef CONFIG_NFS_V4_2 static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = { .sco_sb_deactive = nfs_sb_deactive, }; +#endif #if IS_ENABLED(CONFIG_NFS_V4) static int __init register_nfs4_fs(void) @@ -111,15 +113,21 @@ static void unregister_nfs4_fs(void) } #endif +#ifdef CONFIG_NFS_V4_2 static void nfs_ssc_register_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); +#endif } static void nfs_ssc_unregister_ops(void) { +#ifdef CONFIG_NFSD_V4 nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); +#endif } +#endif /* CONFIG_NFS_V4_2 */ static struct shrinker acl_shrinker = { .count_objects = nfs_access_cache_count, @@ -148,7 +156,9 @@ int __init register_nfs_fs(void) ret = register_shrinker(&acl_shrinker); if (ret < 0) goto error_3; +#ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); +#endif return 0; error_3: nfs_unregister_sysctl(); @@ -168,7 +178,9 @@ void __exit unregister_nfs_fs(void) unregister_shrinker(&acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); +#ifdef CONFIG_NFS_V4_2 nfs_ssc_unregister_ops(); +#endif unregister_filesystem(&nfs_fs_type); } diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index fa82f5aaa6d9..119c75ab9fd0 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o nfs_acl-objs := nfsacl.o obj-$(CONFIG_GRACE_PERIOD) += grace.o -obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o +obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c index f43bbb373913..7c1509e968c8 100644 --- a/fs/nfs_common/nfs_ssc.c +++ b/fs/nfs_common/nfs_ssc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * fs/nfs_common/nfs_ssc_comm.c - * * Helper for knfsd's SSC to access ops in NFS client modules * * Author: Dai Ngo <dai.ngo@oracle.com> diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index d056ad2fdefd..79c563c1a5e8 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -295,3 +295,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, nfsacl_desc.desc.array_len; } EXPORT_SYMBOL_GPL(nfsacl_decode); + +/** + * nfs_stream_decode_acl - Decode an NFSv3 ACL + * + * @xdr: an xdr_stream positioned at an encoded ACL + * @aclcnt: OUT: count of ACEs in decoded posix_acl + * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl + * + * Return values: + * %false: The encoded ACL is not valid + * %true: @pacl contains a decoded ACL, and @xdr is advanced + * + * On a successful return, caller must release *pacl using posix_acl_release(). + */ +bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + const size_t elem_size = XDR_UNIT * 3; + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = elem_size, + .xcode = pacl ? xdr_nfsace_decode : NULL, + }, + }; + unsigned int base; + u32 entries; + + if (xdr_stream_decode_u32(xdr, &entries) < 0) + return false; + if (entries > NFS_ACL_MAX_ENTRIES) + return false; + + base = xdr_stream_pos(xdr); + if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries)) + return false; + nfsacl_desc.desc.array_maxlen = entries; + if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc)) + return false; + + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return false; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return true; +} +EXPORT_SYMBOL_GPL(nfs_stream_decode_acl); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index dbbc583d6273..821e5913faee 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -76,6 +76,7 @@ config NFSD_V4 select CRYPTO_MD5 select CRYPTO_SHA256 select GRACE_PERIOD + select NFS_V4_2_SSC_HELPER if NFS_V4_2 help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 81e7bb12aca6..7c863f2c21e0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -331,12 +331,29 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) fsloc->locations = NULL; } +static int export_stats_init(struct export_stats *stats) +{ + stats->start_time = ktime_get_seconds(); + return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_reset(struct export_stats *stats) +{ + nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); +} + +static void export_stats_destroy(struct export_stats *stats) +{ + nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); +} + static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); + export_stats_destroy(&exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -692,22 +709,47 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); +static int is_export_stats_file(struct seq_file *m) +{ + /* + * The export_stats file uses the same ops as the exports file. + * We use the file's name to determine the reported info per export. + * There is no rename in nsfdfs, so d_name.name is stable. + */ + return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); +} + static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { - struct svc_export *exp ; + struct svc_export *exp; + bool export_stats = is_export_stats_file(m); - if (h ==NULL) { - seq_puts(m, "#path domain(flags)\n"); + if (h == NULL) { + if (export_stats) + seq_puts(m, "#path domain start-time\n#\tstats\n"); + else + seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); + if (export_stats) { + seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + seq_printf(m, "\tfh_stale: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + seq_printf(m, "\tio_read: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + seq_printf(m, "\tio_write: %lld\n", + percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + seq_putc(m, '\n'); + return 0; + } seq_putc(m, '('); - if (test_bit(CACHE_VALID, &h->flags) && + if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); @@ -748,6 +790,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; + export_stats_reset(&new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -780,10 +823,15 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); - if (i) - return &i->h; - else + if (!i) + return NULL; + + if (export_stats_init(&i->ex_stats)) { + kfree(i); return NULL; + } + + return &i->h; } static const struct cache_detail svc_export_cache_template = { @@ -1245,10 +1293,14 @@ static int e_show(struct seq_file *m, void *p) struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; + bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); - seq_puts(m, "# Path Client(Flags) # IPs\n"); + if (export_stats) + seq_puts(m, "# Path Client Start-time\n#\tStats\n"); + else + seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index e7daa1f246f0..ee0e3aba4a6e 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -6,6 +6,7 @@ #define NFSD_EXPORT_H #include <linux/sunrpc/cache.h> +#include <linux/percpu_counter.h> #include <uapi/linux/nfsd/export.h> #include <linux/nfs4.h> @@ -46,6 +47,19 @@ struct exp_flavor_info { u32 flags; }; +/* Per-export stats */ +enum { + EXP_STATS_FH_STALE, + EXP_STATS_IO_READ, + EXP_STATS_IO_WRITE, + EXP_STATS_COUNTERS_NUM +}; + +struct export_stats { + time64_t start_time; + struct percpu_counter counter[EXP_STATS_COUNTERS_NUM]; +}; + struct svc_export { struct cache_head h; struct auth_domain * ex_client; @@ -62,6 +76,7 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; + struct export_stats ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 7346acda9d76..c330f5bd0cf3 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -10,6 +10,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <linux/percpu_counter.h> /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -21,6 +22,14 @@ struct cld_net; struct nfsd4_client_tracking_ops; +enum { + /* cache misses due only to checksum comparison failures */ + NFSD_NET_PAYLOAD_MISSES, + /* amount of memory (in bytes) currently consumed by the DRC */ + NFSD_NET_DRC_MEM_USAGE, + NFSD_NET_COUNTERS_NUM +}; + /* * Represents a nfsd "container". With respect to nfsv4 state tracking, the * fields of interest are the *_id_hashtbls and the *_name_tree. These track @@ -149,20 +158,16 @@ struct nfsd_net { /* * Stats and other tracking of on the duplicate reply cache. - * These fields and the "rc" fields in nfsdstats are modified - * with only the per-bucket cache lock, which isn't really safe - * and should be fixed if we want the statistics to be - * completely accurate. + * The longest_chain* fields are modified with only the per-bucket + * cache lock, which isn't really safe and should be fixed if we want + * these statistics to be completely accurate. */ /* total number of entries */ atomic_t num_drc_entries; - /* cache misses due only to checksum comparison failures */ - unsigned int payload_misses; - - /* amount of memory (in bytes) currently consumed by the DRC */ - unsigned int drc_mem_usage; + /* Per-netns stats counters */ + struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; /* longest hash chain seen */ unsigned int longest_chain; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index b0f66604532a..7eeac5b81c20 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -188,63 +188,49 @@ out: static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *argp = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) return 0; - argp->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_setaclargs *argp = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &argp->fh)) return 0; - argp->mask = ntohl(*p++); - if (argp->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) return 0; - - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (argp->mask & NFS_ACL) ? - &argp->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (argp->mask & NFS_DFACL) ? - &argp->acl_default : NULL); - return (n > 0); -} - -static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_fhandle *argp = rqstp->rq_argp; - - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) return 0; - return xdr_argsize_check(rqstp, p); + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) + return 0; + + return 1; } static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_accessargs *argp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_accessargs *args = rqstp->rq_argp; - p = nfs2svc_decode_fh(p, &argp->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - argp->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } /* @@ -371,6 +357,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC2_GETACL] = { .pc_func = nfsacld_proc_getacl, @@ -381,6 +368,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC2_SETACL] = { .pc_func = nfsacld_proc_setacl, @@ -391,16 +379,18 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "SETACL", }, [ACLPROC2_GETATTR] = { .pc_func = nfsacld_proc_getattr, - .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfsaclsvc_encode_attrstatres, .pc_release = nfsaclsvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [ACLPROC2_ACCESS] = { .pc_func = nfsacld_proc_access, @@ -411,6 +401,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1, + .pc_name = "SETATTR", }, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 7c30876a31a1..a568b842e9eb 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -124,43 +124,39 @@ out: /* * XDR decode functions */ + static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_getaclargs *args = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->mask) < 0) return 0; - args->mask = ntohl(*p); p++; - return xdr_argsize_check(rqstp, p); + return 1; } - static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p) { - struct nfsd3_setaclargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - unsigned int base; - int n; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nfsd3_setaclargs *argp = rqstp->rq_argp; - p = nfs3svc_decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh)) return 0; - args->mask = ntohl(*p++); - if (args->mask & ~NFS_ACL_MASK || - !xdr_argsize_check(rqstp, p)) + if (xdr_stream_decode_u32(xdr, &argp->mask) < 0) + return 0; + if (argp->mask & ~NFS_ACL_MASK) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL)) + return 0; + if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL)) return 0; - base = (char *)p - (char *)head->iov_base; - n = nfsacl_decode(&rqstp->rq_arg, base, NULL, - (args->mask & NFS_ACL) ? - &args->acl_access : NULL); - if (n > 0) - n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, - (args->mask & NFS_DFACL) ? - &args->acl_default : NULL); - return (n > 0); + return 1; } /* @@ -251,6 +247,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [ACLPROC3_GETACL] = { .pc_func = nfsd3_proc_getacl, @@ -261,6 +258,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_getaclres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+2*(1+ACL), + .pc_name = "GETACL", }, [ACLPROC3_SETACL] = { .pc_func = nfsd3_proc_setacl, @@ -271,6 +269,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { .pc_ressize = sizeof(struct nfsd3_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT, + .pc_name = "SETACL", }, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 76931f4f57c3..8675851199f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -124,15 +124,16 @@ nfsd3_proc_access(struct svc_rqst *rqstp) static __be32 nfsd3_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd3_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ fh_copy(&resp->fh, &argp->fh); resp->len = NFS3_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len); return rpc_success; } @@ -144,25 +145,38 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - u32 max_blocksize = svc_max_payload(rqstp); - unsigned long cnt = min(argp->count, max_blocksize); + u32 max_blocksize = svc_max_payload(rqstp); + unsigned int len; + int v; + + argp->count = min_t(u32, argp->count, max_blocksize); dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), (unsigned long) argp->count, (unsigned long long) argp->offset); + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - resp->count = cnt; + resp->count = argp->count; svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, argp->vlen, &resp->count, - &resp->eof); + rqstp->rq_vec, v, &resp->count, &resp->eof); return rpc_success; } @@ -421,6 +435,23 @@ nfsd3_proc_link(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd3_readdirres *resp, + int count) +{ + count = min_t(u32, count, svc_max_payload(rqstp)); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + while (count > 0) { + rqstp->rq_next_page++; + count -= PAGE_SIZE; + } +} + /* * Read a portion of a directory. */ @@ -430,6 +461,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirargs *argp = rqstp->rq_argp; struct nfsd3_readdirres *resp = rqstp->rq_resp; int count = 0; + loff_t offset; struct page **p; caddr_t page_addr = NULL; @@ -437,18 +469,16 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Make sure we've room for the NULL ptr & eof flag, and shrink to - * client read size */ - count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - resp->buflen = count; resp->common.err = nfs_ok; - resp->buffer = argp->buffer; resp->rqstp = rqstp; - resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie, + offset = argp->cookie; + + resp->status = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); count = 0; @@ -464,8 +494,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) } resp->count = count >> 2; if (resp->offset) { - loff_t offset = argp->cookie; - if (unlikely(resp->offset1)) { /* we ended up with offset on a page boundary */ *resp->offset = htonl(offset >> 32); @@ -498,16 +526,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, (u32) argp->cookie); - /* Convert byte count to number of words (i.e. >> 2), - * and reserve room for the NULL ptr & eof flag (-2 words) */ - resp->count = (argp->count >> 2) - 2; + nfsd3_init_dirlist_pages(rqstp, resp, argp->count); /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); resp->common.err = nfs_ok; - resp->buffer = argp->buffer; - resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; @@ -683,7 +707,6 @@ out: * NFSv3 Server procedures. * Only the results of non-idempotent operations are cached. */ -#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle #define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat #define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat #define nfsd3_mkdirargs nfsd3_createargs @@ -708,16 +731,18 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST, + .pc_name = "NULL", }, [NFS3PROC_GETATTR] = { .pc_func = nfsd3_proc_getattr, .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_attrstatres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_fhandleargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_attrstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFS3PROC_SETATTR] = { .pc_func = nfsd3_proc_setattr, @@ -728,6 +753,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "SETATTR", }, [NFS3PROC_LOOKUP] = { .pc_func = nfsd3_proc_lookup, @@ -738,6 +764,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+pAT+pAT, + .pc_name = "LOOKUP", }, [NFS3PROC_ACCESS] = { .pc_func = nfsd3_proc_access, @@ -748,16 +775,18 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_accessres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1, + .pc_name = "ACCESS", }, [NFS3PROC_READLINK] = { .pc_func = nfsd3_proc_readlink, - .pc_decode = nfs3svc_decode_readlinkargs, + .pc_decode = nfs3svc_decode_fhandleargs, .pc_encode = nfs3svc_encode_readlinkres, .pc_release = nfs3svc_release_fhandle, - .pc_argsize = sizeof(struct nfsd3_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd3_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFS3PROC_READ] = { .pc_func = nfsd3_proc_read, @@ -768,6 +797,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4, + .pc_name = "READ", }, [NFS3PROC_WRITE] = { .pc_func = nfsd3_proc_write, @@ -778,6 +808,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_writeres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+4, + .pc_name = "WRITE", }, [NFS3PROC_CREATE] = { .pc_func = nfsd3_proc_create, @@ -788,6 +819,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "CREATE", }, [NFS3PROC_MKDIR] = { .pc_func = nfsd3_proc_mkdir, @@ -798,6 +830,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKDIR", }, [NFS3PROC_SYMLINK] = { .pc_func = nfsd3_proc_symlink, @@ -808,6 +841,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "SYMLINK", }, [NFS3PROC_MKNOD] = { .pc_func = nfsd3_proc_mknod, @@ -818,6 +852,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_createres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+(1+FH+pAT)+WC, + .pc_name = "MKNOD", }, [NFS3PROC_REMOVE] = { .pc_func = nfsd3_proc_remove, @@ -828,6 +863,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "REMOVE", }, [NFS3PROC_RMDIR] = { .pc_func = nfsd3_proc_rmdir, @@ -838,6 +874,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_wccstatres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC, + .pc_name = "RMDIR", }, [NFS3PROC_RENAME] = { .pc_func = nfsd3_proc_rename, @@ -848,6 +885,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_renameres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+WC+WC, + .pc_name = "RENAME", }, [NFS3PROC_LINK] = { .pc_func = nfsd3_proc_link, @@ -858,6 +896,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_linkres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+pAT+WC, + .pc_name = "LINK", }, [NFS3PROC_READDIR] = { .pc_func = nfsd3_proc_readdir, @@ -867,6 +906,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFS3PROC_READDIRPLUS] = { .pc_func = nfsd3_proc_readdirplus, @@ -876,6 +916,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_argsize = sizeof(struct nfsd3_readdirplusargs), .pc_ressize = sizeof(struct nfsd3_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIRPLUS", }, [NFS3PROC_FSSTAT] = { .pc_func = nfsd3_proc_fsstat, @@ -885,6 +926,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsstatres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+2*6+1, + .pc_name = "FSSTAT", }, [NFS3PROC_FSINFO] = { .pc_func = nfsd3_proc_fsinfo, @@ -894,6 +936,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_fsinfores), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+12, + .pc_name = "FSINFO", }, [NFS3PROC_PATHCONF] = { .pc_func = nfsd3_proc_pathconf, @@ -903,6 +946,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_pathconfres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+pAT+6, + .pc_name = "PATHCONF", }, [NFS3PROC_COMMIT] = { .pc_func = nfsd3_proc_commit, @@ -913,6 +957,7 @@ static const struct svc_procedure nfsd_procedures3[22] = { .pc_ressize = sizeof(struct nfsd3_commitres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+WC+2, + .pc_name = "COMMIT", }, }; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 821db21ba072..9d9a01ce0b27 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -29,8 +29,9 @@ static u32 nfs3_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6) */ + static __be32 * encode_time3(__be32 *p, struct timespec64 *time) { @@ -38,32 +39,47 @@ encode_time3(__be32 *p, struct timespec64 *time) return p; } -static __be32 * -decode_time3(__be32 *p, struct timespec64 *time) +static bool +svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep) { - time->tv_sec = ntohl(*p++); - time->tv_nsec = ntohl(*p++); - return p; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + timep->tv_sec = be32_to_cpup(p++); + timep->tv_nsec = be32_to_cpup(p); + + return true; } -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) +/** + * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle + * @xdr: XDR stream positioned at an undecoded NFSv3 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool +svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp) { - unsigned int size; + __be32 *p; + u32 size; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_FHSIZE) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; fh_init(fhp, NFS3_FHSIZE); - size = ntohl(*p++); - if (size > NFS3_FHSIZE) - return NULL; - - memcpy(&fhp->fh_handle.fh_base, p, size); fhp->fh_handle.fh_size = size; - return p + XDR_QUADLEN(size); -} + memcpy(&fhp->fh_handle.fh_base, p, size); -/* Helper function for NFSv3 ACL code */ -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); + return true; } static __be32 * @@ -76,69 +92,165 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + XDR_QUADLEN(size); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len) { - char *name; - unsigned int i; + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS3_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) { + if (*c == '\0' || *c == '/') + return false; } - return p; + return true; } -static __be32 * -decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns) +static bool +svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) +{ + return svcxdr_decode_nfs_fh3(xdr, fhp) && + svcxdr_decode_filename3(xdr, name, len); +} + +static bool +svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) { - u32 tmp; + u32 set_it; iap->ia_valid = 0; - if (*p++) { + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 mode; + + if (xdr_stream_decode_u32(xdr, &mode) < 0) + return false; iap->ia_valid |= ATTR_MODE; - iap->ia_mode = ntohl(*p++); + iap->ia_mode = mode; } - if (*p++) { - iap->ia_uid = make_kuid(userns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 uid; + + if (xdr_stream_decode_u32(xdr, &uid) < 0) + return false; + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if (*p++) { - iap->ia_gid = make_kgid(userns, ntohl(*p++)); + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u32 gid; + + if (xdr_stream_decode_u32(xdr, &gid) < 0) + return false; + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if (*p++) { - u64 newsize; + if (xdr_stream_decode_bool(xdr, &set_it) < 0) + return false; + if (set_it) { + u64 newsize; + if (xdr_stream_decode_u64(xdr, &newsize) < 0) + return false; iap->ia_valid |= ATTR_SIZE; - p = xdr_decode_hyper(p, &newsize); iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX); } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_ATIME; - } else if (tmp == 2) { /* set to client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime)) + return false; iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = ntohl(*p++); - iap->ia_atime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ + if (xdr_stream_decode_u32(xdr, &set_it) < 0) + return false; + switch (set_it) { + case DONT_CHANGE: + break; + case SET_TO_SERVER_TIME: iap->ia_valid |= ATTR_MTIME; - } else if (tmp == 2) { /* set to client time */ + break; + case SET_TO_CLIENT_TIME: + if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime)) + return false; iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = ntohl(*p++); - iap->ia_mtime.tv_nsec = ntohl(*p++); + break; + default: + return false; } - return p; + + return true; +} + +static bool +svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) +{ + __be32 *p; + u32 check; + + if (xdr_stream_decode_bool(xdr, &check) < 0) + return false; + if (check) { + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->check_guard = 1; + args->guardtime = be32_to_cpup(p); + } else + args->check_guard = 0; + + return true; +} + +static bool +svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 2); + if (!p) + return false; + args->major = be32_to_cpup(p++); + args->minor = be32_to_cpup(p); + + return true; +} + +static bool +svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct nfsd3_mknodargs *args) +{ + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_specdata3(xdr, args); } static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) @@ -252,6 +364,11 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) return encode_post_op_attr(rqstp, p, fhp); } +static bool fs_supports_change_attribute(struct super_block *sb) +{ + return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion; +} + /* * Fill in the pre_op attr for the wcc data */ @@ -260,24 +377,26 @@ void fill_pre_wcc(struct svc_fh *fhp) struct inode *inode; struct kstat stat; bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); - __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) return; inode = d_inode(fhp->fh_dentry); - err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &stat); + + if (err) { + /* Grab the times from inode anyway */ + stat.mtime = inode->i_mtime; + stat.ctime = inode->i_ctime; + stat.size = inode->i_size; + } + fhp->fh_pre_mtime = stat.mtime; + fhp->fh_pre_ctime = stat.ctime; + fhp->fh_pre_size = stat.size; } if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); - fhp->fh_pre_mtime = stat.mtime; - fhp->fh_pre_ctime = stat.ctime; - fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; } @@ -288,7 +407,6 @@ void fill_post_wcc(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); - __be32 err; if (fhp->fh_no_wcc) return; @@ -296,12 +414,16 @@ void fill_post_wcc(struct svc_fh *fhp) if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); - err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - } else - fhp->fh_post_saved = true; + fhp->fh_post_saved = true; + + if (fs_supports_change_attribute(inode->i_sb) || !v4) { + __be32 err = fh_getattr(fhp, &fhp->fh_post_attr); + + if (err) { + fhp->fh_post_saved = false; + fhp->fh_post_attr.ctime = inode->i_ctime; + } + } if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); @@ -312,331 +434,277 @@ void fill_post_wcc(struct svc_fh *fhp) */ int -nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh); } int nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if ((args->check_guard = ntohl(*p++)) != 0) { - struct timespec64 time; - p = decode_time3(p, &time); - args->guardtime = time.tv_sec; - } - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->fh) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) && + svcxdr_decode_sattrguard3(xdr, args); } int nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len); } int nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_accessargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->access) < 0) return 0; - args->access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - u32 max_blocksize = svc_max_payload(rqstp); - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - - args->count = ntohl(*p++); - len = min(args->count, max_blocksize); - - /* set up the kvec */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); struct kvec *head = rqstp->rq_arg.head; struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) return 0; - p = xdr_decode_hyper(p, &args->offset); - - args->count = ntohl(*p++); - args->stable = ntohl(*p++); - len = args->len = ntohl(*p++); - if ((void *)p > head->iov_base + head->iov_len) + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) return 0; - /* - * The count must equal the amount of data passed. - */ - if (args->count != args->len) + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->stable) < 0) return 0; - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; + /* request sanity */ + if (args->count != args->len) + return 0; + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) + return 0; if (args->count > max_blocksize) { args->count = max_blocksize; - len = args->len = max_blocksize; + args->len = max_blocksize; } - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + return 1; } int nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) return 0; - - switch (args->createmode = ntohl(*p++)) { + if (xdr_stream_decode_u32(xdr, &args->createmode) < 0) + return 0; + switch (args->createmode) { case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_GUARDED: - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - break; + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); case NFS3_CREATE_EXCLUSIVE: - args->verf = p; - p += 2; + args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE); + if (!args->verf) + return 0; break; default: return 0; } - - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_createargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) || - !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); } int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t dlen; + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + size_t remaining; - if (!(p = decode_fh(p, &args->ffh)) || - !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen)) + return 0; + if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return 0; - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - args->tlen = ntohl(*p++); + /* request sanity */ + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->tlen)) + return 0; - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - dlen = args->first.iov_len + rqstp->rq_arg.page_len + - rqstp->rq_arg.tail[0].iov_len; - if (dlen < XDR_QUADLEN(args->tlen) << 2) - return 0; return 1; } int nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_mknodargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) + if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->ftype) < 0) + return 0; + switch (args->ftype) { + case NF3CHR: + case NF3BLK: + return svcxdr_decode_devicedata3(rqstp, xdr, args); + case NF3SOCK: + case NF3FIFO: + return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs); + case NF3REG: + case NF3DIR: + case NF3LNK: + /* Valid XDR but illegal file types */ + break; + default: return 0; - - args->ftype = ntohl(*p++); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR - || args->ftype == NF3SOCK || args->ftype == NF3FIFO) - p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp)); - - if (args->ftype == NF3BLK || args->ftype == NF3CHR) { - args->major = ntohl(*p++); - args->minor = ntohl(*p++); } - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); -} - -int -nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd3_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs3(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_nfs_fh3(xdr, &args->ffh) && + svcxdr_decode_diropargs3(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ~0; - args->count = ntohl(*p++); - len = args->count = min_t(u32, args->count, max_blocksize); - - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_readdirargs *args = rqstp->rq_argp; - int len; - u32 max_blocksize = svc_max_payload(rqstp); + u32 dircount; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->cookie) < 0) + return 0; + args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); + if (!args->verf) + return 0; + /* dircount is ignored */ + if (xdr_stream_decode_u32(xdr, &dircount) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->cookie); - args->verf = p; p += 2; - args->dircount = ntohl(*p++); - args->count = ntohl(*p++); - - len = args->count = min(args->count, max_blocksize); - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - if (!args->buffer) - args->buffer = page_address(p); - len -= PAGE_SIZE; - } - return xdr_argsize_check(rqstp, p); + return 1; } int nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_commitargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + + if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u64(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + return 1; } /* @@ -865,9 +933,14 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); - /* filesystem root - cannot return filehandle for ".." */ + /* + * Don't return filehandle for ".." if we're at + * the filesystem or export root: + */ if (dchild == dparent) goto out; + if (dparent == exp->ex_path.dentry) + goto out; } else dchild = dget(dparent); } else diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8d6d2678abad..acdb3cd806a1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -378,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Before RECLAIM_COMPLETE done, server should deny new lock */ if (nfsd4_has_session(cstate) && - !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags) && + !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) return nfserr_grace; @@ -428,8 +427,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; break; case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(&open->op_clientid, - cstate, nn); + status = nfs4_check_open_reclaim(cstate->clp); if (status) goto out; open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; @@ -1888,7 +1886,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - rqstp, cstate->session->se_client, gdp); + rqstp, cstate->clp, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -2174,7 +2172,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp) static inline void nfsd4_increment_op_stats(u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - nfsdstats.nfs4_opcount[opnum]++; + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -3305,6 +3303,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 1, + .pc_name = "NULL", }, [NFSPROC4_COMPOUND] = { .pc_func = nfsd4_proc_compound, @@ -3315,6 +3314,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1d2cd6a88f61..423fd6683f3a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3891,6 +3891,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_reclaim_complete *rc = &u->reclaim_complete; + struct nfs4_client *clp = cstate->clp; __be32 status = 0; if (rc->rca_one_fs) { @@ -3904,12 +3905,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, } status = nfserr_complete_already; - if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, - &cstate->session->se_client->cl_flags)) + if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) goto out; status = nfserr_stale_clientid; - if (is_client_expired(cstate->session->se_client)) + if (is_client_expired(clp)) /* * The following error isn't really legal. * But we only get here if the client just explicitly @@ -3920,8 +3920,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; - nfsd4_client_record_create(cstate->session->se_client); - inc_reclaim_complete(cstate->session->se_client); + nfsd4_client_record_create(clp); + inc_reclaim_complete(clp); out: return status; } @@ -4633,40 +4633,37 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4 return nfserr_bad_seqid; } -static __be32 lookup_clientid(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn, - bool sessions) +static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions, + struct nfsd_net *nn) { struct nfs4_client *found; + spin_lock(&nn->client_lock); + found = find_confirmed_client(clid, sessions, nn); + if (found) + atomic_inc(&found->cl_rpc_users); + spin_unlock(&nn->client_lock); + return found; +} + +static __be32 set_client(clientid_t *clid, + struct nfsd4_compound_state *cstate, + struct nfsd_net *nn) +{ if (cstate->clp) { - found = cstate->clp; - if (!same_clid(&found->cl_clientid, clid)) + if (!same_clid(&cstate->clp->cl_clientid, clid)) return nfserr_stale_clientid; return nfs_ok; } - if (STALE_CLIENTID(clid, nn)) return nfserr_stale_clientid; - /* - * For v4.1+ we get the client in the SEQUENCE op. If we don't have one - * cached already then we know this is for is for v4.0 and "sessions" - * will be false. + * We're in the 4.0 case (otherwise the SEQUENCE op would have + * set cstate->clp), so session = false: */ - WARN_ON_ONCE(cstate->session); - spin_lock(&nn->client_lock); - found = find_confirmed_client(clid, sessions, nn); - if (!found) { - spin_unlock(&nn->client_lock); + cstate->clp = lookup_clientid(clid, false, nn); + if (!cstate->clp) return nfserr_expired; - } - atomic_inc(&found->cl_rpc_users); - spin_unlock(&nn->client_lock); - - /* Cache the nfs4_client in cstate! */ - cstate->clp = found; return nfs_ok; } @@ -4680,8 +4677,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, struct nfs4_openowner *oo = NULL; __be32 status; - if (STALE_CLIENTID(&open->op_clientid, nn)) - return nfserr_stale_clientid; /* * In case we need it later, after we've already created the * file and don't want to risk a further failure: @@ -4690,7 +4685,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate, if (open->op_file == NULL) return nfserr_jukebox; - status = lookup_clientid(clientid, cstate, nn, false); + status = set_client(clientid, cstate, nn); if (status) return status; clp = cstate->clp; @@ -5300,17 +5295,14 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); trace_nfsd_clid_renew(clid); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) - goto out; + return status; clp = cstate->clp; - status = nfserr_cb_path_down; if (!list_empty(&clp->cl_delegations) && clp->cl_cb_state != NFSD4_CB_UP) - goto out; - status = nfs_ok; -out: - return status; + return nfserr_cb_path_down; + return nfs_ok; } void @@ -5686,8 +5678,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return nfserr_bad_stateid; - status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn, - false); + status = set_client(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { if (cstate->session) return nfserr_bad_stateid; @@ -5818,21 +5809,27 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, { __be32 status; struct nfs4_cpntf_state *cps = NULL; - struct nfsd4_compound_state cstate; + struct nfs4_client *found; status = manage_cpntf_state(nn, st, NULL, &cps); if (status) return status; cps->cpntf_time = ktime_get_boottime_seconds(); - memset(&cstate, 0, sizeof(cstate)); - status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true); - if (status) + + status = nfserr_expired; + found = lookup_clientid(&cps->cp_p_clid, true, nn); + if (!found) goto out; - status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - stid, nn); - put_client_renew(cstate.clp); + + *stid = find_stateid_by_type(found, &cps->cp_p_stateid, + NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + if (*stid) + status = nfs_ok; + else + status = nfserr_bad_stateid; + + put_client_renew(found); out: nfs4_put_cpntf_state(nn, cps); return status; @@ -5921,7 +5918,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; struct nfsd4_test_stateid_id *stateid; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) stateid->ts_id_status = @@ -5967,7 +5964,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_client *cl = cstate->session->se_client; + struct nfs4_client *cl = cstate->clp; __be32 ret = nfserr_bad_stateid; spin_lock(&cl->cl_lock); @@ -6696,13 +6693,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (nfsd4_has_session(cstate)) /* See rfc 5661 18.10.3: given clientid is ignored: */ memcpy(&lock->lk_new_clientid, - &cstate->session->se_client->cl_clientid, + &cstate->clp->cl_clientid, sizeof(clientid_t)); - status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid, nn)) - goto out; - /* validate and update open stateid and open seqid */ status = nfs4_preprocess_confirmed_seqid_op(cstate, lock->lk_new_open_seqid, @@ -6909,8 +6902,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; if (!nfsd4_has_session(cstate)) { - status = lookup_clientid(&lockt->lt_clientid, cstate, nn, - false); + status = set_client(&lockt->lt_clientid, cstate, nn); if (status) goto out; } @@ -7094,7 +7086,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); - status = lookup_clientid(clid, cstate, nn, false); + status = set_client(clid, cstate, nn); if (status) return status; @@ -7230,25 +7222,13 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) return NULL; } -/* -* Called from OPEN. Look for clientid in reclaim list. -*/ __be32 -nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, - struct nfsd_net *nn) +nfs4_check_open_reclaim(struct nfs4_client *clp) { - __be32 status; - - /* find clientid in conf_id_hashtbl */ - status = lookup_clientid(clid, cstate, nn, false); - if (status) - return nfserr_reclaim_bad; - - if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags)) return nfserr_no_grace; - if (nfsd4_client_record_check(cstate->clp)) + if (nfsd4_client_record_check(clp)) return nfserr_reclaim_bad; return nfs_ok; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80c90fc231a5..96cdf77925f3 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -121,14 +121,14 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, struct nfsd_net *nn) { if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nn->drc_mem_usage -= rp->c_replvec.iov_len; + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); kfree(rp->c_replvec.iov_base); } if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); - nn->drc_mem_usage -= sizeof(*rp); + nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } kmem_cache_free(drc_slab, rp); } @@ -154,6 +154,16 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } +static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +{ + return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); +} + +static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +{ + nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); +} + int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -165,12 +175,16 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); + status = nfsd_reply_cache_stats_init(nn); + if (status) + goto out_nomem; + nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker); if (status) - goto out_nomem; + goto out_stats_destroy; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -186,6 +200,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); +out_stats_destroy: + nfsd_reply_cache_stats_destroy(nn); out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; @@ -196,6 +212,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; + nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -324,7 +341,7 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { - ++nn->payload_misses; + nfsd_stats_payload_misses_inc(nn); trace_nfsd_drc_mismatch(nn, key, rp); } @@ -407,7 +424,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { - nfsdstats.rcnocache++; + nfsd_stats_rc_nocache_inc(); goto out; } @@ -429,12 +446,12 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) goto found_entry; } - nfsdstats.rcmisses++; + nfsd_stats_rc_misses_inc(); rqstp->rq_cacherep = rp; rp->c_state = RC_INPROG; atomic_inc(&nn->num_drc_entries); - nn->drc_mem_usage += sizeof(*rp); + nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); /* go ahead and prune the cache */ prune_bucket(b, nn); @@ -446,7 +463,7 @@ out: found_entry: /* We found a matching entry which is either in progress or done. */ - nfsdstats.rchits++; + nfsd_stats_rc_hits_inc(); rtn = RC_DROPIT; /* Request being processed */ @@ -548,7 +565,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) return; } spin_lock(&b->cache_lock); - nn->drc_mem_usage += bufsize; + nfsd_stats_drc_mem_usage_add(nn, bufsize); lru_put_end(b, rp); rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; @@ -588,13 +605,18 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "max entries: %u\n", nn->max_drc_entries); seq_printf(m, "num entries: %u\n", - atomic_read(&nn->num_drc_entries)); + atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); - seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage); - seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); - seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses); - seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache); - seq_printf(m, "payload misses: %u\n", nn->payload_misses); + seq_printf(m, "mem usage: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + seq_printf(m, "cache hits: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + seq_printf(m, "cache misses: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + seq_printf(m, "not cached: %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + seq_printf(m, "payload misses: %lld\n", + percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f6d5d783f4a4..4f6e514192bd 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -32,6 +32,7 @@ enum { NFSD_Root = 1, NFSD_List, + NFSD_Export_Stats, NFSD_Export_features, NFSD_Fh, NFSD_FO_UnlockIP, @@ -1348,6 +1349,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, + /* Per-export io stats use same ops as exports file */ + [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, [NFSD_FO_UnlockIP] = {"unlock_ip", @@ -1534,7 +1537,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - nfsd_stat_init(); /* Statistics */ + retval = nfsd_stat_init(); /* Statistics */ + if (retval) + goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) goto out_free_stat; @@ -1554,6 +1559,7 @@ out_free_lockd: nfsd_drc_slab_free(); out_free_stat: nfsd_stat_shutdown(); +out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d63cf8196fed..8bdc37aa2c2e 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,8 +24,8 @@ #include <uapi/linux/nfsd/debug.h> #include "netns.h" -#include "stats.h" #include "export.h" +#include "stats.h" #undef ifdebug #ifdef CONFIG_SUNRPC_DEBUG diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 66f2ef67792a..4744a276058d 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -349,7 +349,7 @@ out: __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { - struct svc_export *exp; + struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -422,7 +422,7 @@ skip_pseudoflavor_check: } out: if (error == nfserr_stale) - nfsdstats.fh_stale++; + nfsd_stats_fh_stale_inc(exp); return error; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cb20c2cd3469..f58933519f38 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -12,6 +12,7 @@ #include <linux/sunrpc/svc.h> #include <uapi/linux/nfsd/nfsfh.h> #include <linux/iversion.h> +#include <linux/exportfs.h> static inline __u32 ino_t_to_u32(ino_t ino) { @@ -264,7 +265,9 @@ fh_clear_wcc(struct svc_fh *fhp) static inline u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) { - if (IS_I_VERSION(inode)) { + if (inode->i_sb->s_export_op->fetch_iversion) + return inode->i_sb->s_export_op->fetch_iversion(inode); + else if (IS_I_VERSION(inode)) { u64 chattr; chattr = stat->ctime.tv_sec; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 9473d048efec..b2f8035f166b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -149,14 +149,15 @@ out: static __be32 nfsd_proc_readlink(struct svc_rqst *rqstp) { - struct nfsd_readlinkargs *argp = rqstp->rq_argp; + struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_readlinkres *resp = rqstp->rq_resp; + char *buffer = page_address(*(rqstp->rq_next_page++)); dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; - resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len); + resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len); fh_put(&argp->fh); return rpc_success; @@ -171,32 +172,36 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; + unsigned int len; u32 eof; + int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->offset); + argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + + v = 0; + len = argp->count; + while (len > 0) { + struct page *page = *(rqstp->rq_next_page++); + + rqstp->rq_vec[v].iov_base = page_address(page); + rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); + len -= rqstp->rq_vec[v].iov_len; + v++; + } + /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. */ - - if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { - char buf[RPC_MAX_ADDRBUFLEN]; - printk(KERN_NOTICE - "oversized read request from %s (%d bytes)\n", - svc_print_addr(rqstp, buf, sizeof(buf)), - argp->count); - argp->count = NFSSVC_MAXBLKSIZE_V2; - } svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, - rqstp->rq_vec, argp->vlen, - &resp->count, - &eof); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, + rqstp->rq_vec, v, &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) @@ -548,6 +553,20 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) return rpc_success; } +static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp, + struct nfsd_readdirres *resp, + int count) +{ + count = min_t(u32, count, PAGE_SIZE); + + /* Convert byte count to number of words (i.e. >> 2), + * and reserve room for the NULL ptr & eof flag (-2 words) */ + resp->buflen = (count >> 2) - 2; + + resp->buffer = page_address(*rqstp->rq_next_page); + rqstp->rq_next_page++; +} + /* * Read a portion of a directory. */ @@ -556,31 +575,24 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) { struct nfsd_readdirargs *argp = rqstp->rq_argp; struct nfsd_readdirres *resp = rqstp->rq_resp; - int count; loff_t offset; + __be32 *buffer; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), argp->count, argp->cookie); - /* Shrink to the client read size */ - count = (argp->count >> 2) - 2; - - /* Make sure we've room for the NULL ptr & eof flag */ - count -= 2; - if (count < 0) - count = 0; + nfsd_init_dirlist_pages(rqstp, resp, argp->count); + buffer = resp->buffer; - resp->buffer = argp->buffer; resp->offset = NULL; - resp->buflen = count; resp->common.err = nfs_ok; /* Read directory and encode entries on the fly */ offset = argp->cookie; resp->status = nfsd_readdir(rqstp, &argp->fh, &offset, &resp->common, nfssvc_encode_entry); - resp->count = resp->buffer - argp->buffer; + resp->count = resp->buffer - buffer; if (resp->offset) *resp->offset = htonl(offset); @@ -623,16 +635,18 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "NULL", }, [NFSPROC_GETATTR] = { .pc_func = nfsd_proc_getattr, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_attrstat, .pc_release = nfssvc_release_attrstat, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT, + .pc_name = "GETATTR", }, [NFSPROC_SETATTR] = { .pc_func = nfsd_proc_setattr, @@ -643,6 +657,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "SETATTR", }, [NFSPROC_ROOT] = { .pc_func = nfsd_proc_root, @@ -652,6 +667,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "ROOT", }, [NFSPROC_LOOKUP] = { .pc_func = nfsd_proc_lookup, @@ -662,15 +678,17 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+FH+AT, + .pc_name = "LOOKUP", }, [NFSPROC_READLINK] = { .pc_func = nfsd_proc_readlink, - .pc_decode = nfssvc_decode_readlinkargs, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_readlinkres, - .pc_argsize = sizeof(struct nfsd_readlinkargs), + .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_readlinkres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4, + .pc_name = "READLINK", }, [NFSPROC_READ] = { .pc_func = nfsd_proc_read, @@ -681,6 +699,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { .pc_func = nfsd_proc_writecache, @@ -690,6 +709,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_voidres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = 0, + .pc_name = "WRITECACHE", }, [NFSPROC_WRITE] = { .pc_func = nfsd_proc_write, @@ -700,6 +720,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_attrstat), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+AT, + .pc_name = "WRITE", }, [NFSPROC_CREATE] = { .pc_func = nfsd_proc_create, @@ -710,6 +731,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "CREATE", }, [NFSPROC_REMOVE] = { .pc_func = nfsd_proc_remove, @@ -719,6 +741,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "REMOVE", }, [NFSPROC_RENAME] = { .pc_func = nfsd_proc_rename, @@ -728,6 +751,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RENAME", }, [NFSPROC_LINK] = { .pc_func = nfsd_proc_link, @@ -737,6 +761,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "LINK", }, [NFSPROC_SYMLINK] = { .pc_func = nfsd_proc_symlink, @@ -746,6 +771,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "SYMLINK", }, [NFSPROC_MKDIR] = { .pc_func = nfsd_proc_mkdir, @@ -756,6 +782,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_diropres), .pc_cachetype = RC_REPLBUFF, .pc_xdrressize = ST+FH+AT, + .pc_name = "MKDIR", }, [NFSPROC_RMDIR] = { .pc_func = nfsd_proc_rmdir, @@ -765,6 +792,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_ressize = sizeof(struct nfsd_stat), .pc_cachetype = RC_REPLSTAT, .pc_xdrressize = ST, + .pc_name = "RMDIR", }, [NFSPROC_READDIR] = { .pc_func = nfsd_proc_readdir, @@ -773,15 +801,17 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argsize = sizeof(struct nfsd_readdirargs), .pc_ressize = sizeof(struct nfsd_readdirres), .pc_cachetype = RC_NOCACHE, + .pc_name = "READDIR", }, [NFSPROC_STATFS] = { .pc_func = nfsd_proc_statfs, - .pc_decode = nfssvc_decode_fhandle, + .pc_decode = nfssvc_decode_fhandleargs, .pc_encode = nfssvc_encode_statfsres, .pc_argsize = sizeof(struct nfsd_fhandle), .pc_ressize = sizeof(struct nfsd_statfsres), .pc_cachetype = RC_NOCACHE, .pc_xdrressize = ST+5, + .pc_name = "STATFS", }, }; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index f9c9f4c63cc7..6de406322106 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -955,37 +955,6 @@ out: return 0; } -/* - * A write procedure can have a large argument, and a read procedure can - * have a large reply, but no NFSv2 or NFSv3 procedure has argument and - * reply that can both be larger than a page. The xdr code has taken - * advantage of this assumption to be a sloppy about bounds checking in - * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that - * problem, we enforce these assumptions here: - */ -static bool nfs_request_too_big(struct svc_rqst *rqstp, - const struct svc_procedure *proc) -{ - /* - * The ACL code has more careful bounds-checking and is not - * susceptible to this problem: - */ - if (rqstp->rq_prog != NFS_PROGRAM) - return false; - /* - * Ditto NFSv4 (which can in theory have argument and reply both - * more than a page): - */ - if (rqstp->rq_vers >= 4) - return false; - /* The reply will be small, we're OK: */ - if (proc->pc_xdrressize > 0 && - proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) - return false; - - return rqstp->rq_arg.len > PAGE_SIZE; -} - /** * nfsd_dispatch - Process an NFS or NFSACL Request * @rqstp: incoming request @@ -1004,9 +973,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; - if (nfs_request_too_big(rqstp, proc)) - goto out_decode_err; - /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 7aa6e8aca2c1..5d79ef6a0c7f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -23,24 +23,31 @@ static u32 nfs_ftypes[] = { /* - * XDR functions for basic NFS types + * Basic NFSv2 data types (RFC 1094 Section 2.3) */ -static __be32 * -decode_fh(__be32 *p, struct svc_fh *fhp) + +/** + * svcxdr_decode_fhandle - Decode an NFSv2 file handle + * @xdr: XDR stream positioned at an encoded NFSv2 FH + * @fhp: OUT: filled-in server file handle + * + * Return values: + * %false: The encoded file handle was not valid + * %true: @fhp has been initialized + */ +bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp) { + __be32 *p; + + p = xdr_inline_decode(xdr, NFS_FHSIZE); + if (!p) + return false; fh_init(fhp, NFS_FHSIZE); memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE); fhp->fh_handle.fh_size = NFS_FHSIZE; - /* FIXME: Look up export pointer here and verify - * Sun Secure RPC if requested */ - return p + (NFS_FHSIZE >> 2); -} - -/* Helper function for NFSv2 ACL code */ -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp) -{ - return decode_fh(p, fhp); + return true; } static __be32 * @@ -50,66 +57,95 @@ encode_fh(__be32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE>> 2); } -/* - * Decode a file name and make sure that the path contains - * no slashes or null bytes. - */ -static __be32 * -decode_filename(__be32 *p, char **namp, unsigned int *lenp) +static bool +svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len) { - char *name; - unsigned int i; - - if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { - for (i = 0, name = *namp; i < *lenp; i++, name++) { - if (*name == '\0' || *name == '/') - return NULL; - } - } + u32 size, i; + __be32 *p; + char *c; + + if (xdr_stream_decode_u32(xdr, &size) < 0) + return false; + if (size == 0 || size > NFS_MAXNAMLEN) + return false; + p = xdr_inline_decode(xdr, size); + if (!p) + return false; - return p; + *len = size; + *name = (char *)p; + for (i = 0, c = *name; i < size; i++, c++) + if (*c == '\0' || *c == '/') + return false; + + return true; } -static __be32 * -decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) +static bool +svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp, + char **name, unsigned int *len) { - u32 tmp, tmp1; + return svcxdr_decode_fhandle(xdr, fhp) && + svcxdr_decode_filename(xdr, name, len); +} + +static bool +svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct iattr *iap) +{ + u32 tmp1, tmp2; + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT * 8); + if (!p) + return false; iap->ia_valid = 0; - /* Sun client bug compatibility check: some sun clients seem to - * put 0xffff in the mode field when they mean 0xffffffff. - * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah. + /* + * Some Sun clients put 0xffff in the mode field when they + * mean 0xffffffff. */ - if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) { + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp1 != 0xffff) { iap->ia_valid |= ATTR_MODE; - iap->ia_mode = tmp; + iap->ia_mode = tmp1; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_uid = make_kuid(userns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1); if (uid_valid(iap->ia_uid)) iap->ia_valid |= ATTR_UID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { - iap->ia_gid = make_kgid(userns, tmp); + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { + iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1); if (gid_valid(iap->ia_gid)) iap->ia_valid |= ATTR_GID; } - if ((tmp = ntohl(*p++)) != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + if (tmp1 != (u32)-1) { iap->ia_valid |= ATTR_SIZE; - iap->ia_size = tmp; + iap->ia_size = tmp1; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - iap->ia_atime.tv_sec = tmp; - iap->ia_atime.tv_nsec = tmp1 * 1000; + iap->ia_atime.tv_sec = tmp1; + iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC; } - tmp = ntohl(*p++); tmp1 = ntohl(*p++); - if (tmp != (u32)-1 && tmp1 != (u32)-1) { + + tmp1 = be32_to_cpup(p++); + tmp2 = be32_to_cpup(p++); + if (tmp1 != (u32)-1 && tmp2 != (u32)-1) { iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; - iap->ia_mtime.tv_sec = tmp; - iap->ia_mtime.tv_nsec = tmp1 * 1000; + iap->ia_mtime.tv_sec = tmp1; + iap->ia_mtime.tv_nsec = tmp2 * NSEC_PER_USEC; /* * Passing the invalid value useconds=1000000 for mtime * is a Sun convention for "set both mtime and atime to @@ -119,10 +155,11 @@ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns) * sattr in section 6.1 of "NFS Illustrated" by * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5 */ - if (tmp1 == 1000000) + if (tmp2 == 1000000) iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET); } - return p; + + return true; } static __be32 * @@ -194,225 +231,158 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f */ int -nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p) +nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_fhandle *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh); } int nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_sattrargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->fh) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_diropargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len); } int nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readargs *args = rqstp->rq_argp; - unsigned int len; - int v; - p = decode_fh(p, &args->fh); - if (!p) - return 0; - - args->offset = ntohl(*p++); - len = args->count = ntohl(*p++); - p++; /* totalcount - unused */ + u32 totalcount; - len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) + return 0; - /* set up somewhere to store response. - * We take pages, put them on reslist and include in iovec - */ - v=0; - while (len > 0) { - struct page *p = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(p); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } - args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_writeargs *args = rqstp->rq_argp; - unsigned int len, hdr, dlen; struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; + u32 beginoffset, totalcount; + size_t remaining; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) return 0; - - p++; /* beginoffset */ - args->offset = ntohl(*p++); /* offset */ - p++; /* totalcount */ - len = args->len = ntohl(*p++); - /* - * The protocol specifies a maximum of 8192 bytes. - */ - if (len > NFSSVC_MAXBLKSIZE_V2) + /* beginoffset is ignored */ + if (xdr_stream_decode_u32(xdr, &beginoffset) < 0) return 0; - - /* - * Check to make sure that we got the right number of - * bytes. - */ - hdr = (void*)p - head->iov_base; - if (hdr > head->iov_len) + if (xdr_stream_decode_u32(xdr, &args->offset) < 0) + return 0; + /* totalcount is ignored */ + if (xdr_stream_decode_u32(xdr, &totalcount) < 0) return 0; - dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; - /* - * Round the length of the data which was specified up to - * the next multiple of XDR units and then compare that - * against the length which was actually received. - * Note that when RPCSEC/GSS (for example) is used, the - * data buffer can be padded so dlen might be larger - * than required. It must never be smaller. - */ - if (dlen < XDR_QUADLEN(len)*4) + /* opaque data */ + if (xdr_stream_decode_u32(xdr, &args->len) < 0) return 0; + if (args->len > NFSSVC_MAXBLKSIZE_V2) + return 0; + remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; + remaining -= xdr_stream_pos(xdr); + if (remaining < xdr_align_size(args->len)) + return 0; + args->first.iov_base = xdr->p; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); - args->first.iov_base = (void *)p; - args->first.iov_len = head->iov_len - hdr; return 1; } int nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_createargs *args = rqstp->rq_argp; - if ( !(p = decode_fh(p, &args->fh)) - || !(p = decode_filename(p, &args->name, &args->len))) - return 0; - p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->fh, + &args->name, &args->len) && + svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_renameargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); -} - -int -nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p) -{ - struct nfsd_readlinkargs *args = rqstp->rq_argp; - - p = decode_fh(p, &args->fh); - if (!p) - return 0; - args->buffer = page_address(*(rqstp->rq_next_page++)); - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_diropargs(xdr, &args->ffh, + &args->fname, &args->flen) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_linkargs *args = rqstp->rq_argp; - if (!(p = decode_fh(p, &args->ffh)) - || !(p = decode_fh(p, &args->tfh)) - || !(p = decode_filename(p, &args->tname, &args->tlen))) - return 0; - - return xdr_argsize_check(rqstp, p); + return svcxdr_decode_fhandle(xdr, &args->ffh) && + svcxdr_decode_diropargs(xdr, &args->tfh, + &args->tname, &args->tlen); } int nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_symlinkargs *args = rqstp->rq_argp; - char *base = (char *)p; - size_t xdrlen; + struct kvec *head = rqstp->rq_arg.head; - if ( !(p = decode_fh(p, &args->ffh)) - || !(p = decode_filename(p, &args->fname, &args->flen))) + if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->tlen) < 0) return 0; - - args->tlen = ntohl(*p++); if (args->tlen == 0) return 0; - args->first.iov_base = p; - args->first.iov_len = rqstp->rq_arg.head[0].iov_len; - args->first.iov_len -= (char *)p - base; - - /* This request is never larger than a page. Therefore, - * transport will deliver either: - * 1. pathname in the pagelist -> sattr is in the tail. - * 2. everything in the head buffer -> sattr is in the head. - */ - if (rqstp->rq_arg.page_len) { - if (args->tlen != rqstp->rq_arg.page_len) - return 0; - p = rqstp->rq_arg.tail[0].iov_base; - } else { - xdrlen = XDR_QUADLEN(args->tlen); - if (xdrlen > args->first.iov_len - (8 * sizeof(__be32))) - return 0; - p += xdrlen; - } - decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp)); - - return 1; + args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + args->first.iov_base = xdr_inline_decode(xdr, args->tlen); + if (!args->first.iov_base) + return 0; + return svcxdr_decode_sattr(rqstp, xdr, &args->attrs); } int nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_readdirargs *args = rqstp->rq_argp; - p = decode_fh(p, &args->fh); - if (!p) + if (!svcxdr_decode_fhandle(xdr, &args->fh)) + return 0; + if (xdr_stream_decode_u32(xdr, &args->cookie) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &args->count) < 0) return 0; - args->cookie = ntohl(*p++); - args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); - args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 9eae11a9d21c..73deea353169 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -649,8 +649,7 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *) extern void nfs4_release_reclaim(struct nfsd_net *); extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn); -extern __be32 nfs4_check_open_reclaim(clientid_t *clid, - struct nfsd4_compound_state *cstate, struct nfsd_net *nn); +extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index b1bc582b0493..1d3b881e7382 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -7,16 +7,14 @@ * Format: * rc <hits> <misses> <nocache> * Statistsics for the reply cache - * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache> + * fh <stale> <deprecated filehandle cache stats> * statistics for filehandle lookup * io <bytes-read> <bytes-written> * statistics for IO throughput - * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> - * time (seconds) when nfsd thread usage above thresholds - * and number of times that all threads were in use - * ra cache-size <10% <20% <30% ... <100% not-found - * number of times that read-ahead entry was found that deep in - * the cache. + * th <threads> <deprecated thread usage histogram stats> + * number of threads + * ra <deprecated ra-cache stats> + * * plus generic RPC stats (see net/sunrpc/stats.c) * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> @@ -38,31 +36,24 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) { int i; - seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n", - nfsdstats.rchits, - nfsdstats.rcmisses, - nfsdstats.rcnocache, - nfsdstats.fh_stale, - nfsdstats.fh_lookup, - nfsdstats.fh_anon, - nfsdstats.fh_nocache_dir, - nfsdstats.fh_nocache_nondir, - nfsdstats.io_read, - nfsdstats.io_write); + seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + /* thread usage: */ - seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt); - for (i=0; i<10; i++) { - unsigned int jifs = nfsdstats.th_usage[i]; - unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ; - seq_printf(seq, " %u.%03u", sec, msec); - } + seq_printf(seq, "th %u 0", nfsdstats.th_cnt); + + /* deprecated thread usage histogram stats */ + for (i = 0; i < 10; i++) + seq_puts(seq, " 0.000"); + + /* deprecated ra-cache stats */ + seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); - /* newline and ra-cache */ - seq_printf(seq, "\nra %u", nfsdstats.ra_size); - for (i=0; i<11; i++) - seq_printf(seq, " %u", nfsdstats.ra_depth[i]); - seq_putc(seq, '\n'); - /* show my rpc info */ svc_seq_show(seq, &nfsd_svcstats); @@ -70,8 +61,10 @@ static int nfsd_proc_show(struct seq_file *seq, void *v) /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); - for (i = 0; i <= LAST_NFS4_OP; i++) - seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]); + for (i = 0; i <= LAST_NFS4_OP; i++) { + seq_printf(seq, " %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + } seq_putc(seq, '\n'); #endif @@ -91,14 +84,63 @@ static const struct proc_ops nfsd_proc_ops = { .proc_release = single_release, }; -void -nfsd_stat_init(void) +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) +{ + int i, err = 0; + + for (i = 0; !err && i < num; i++) + err = percpu_counter_init(&counters[i], 0, GFP_KERNEL); + + if (!err) + return 0; + + for (; i > 0; i--) + percpu_counter_destroy(&counters[i-1]); + + return err; +} + +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num) +{ + int i; + + for (i = 0; i < num; i++) + percpu_counter_set(&counters[i], 0); +} + +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) { + int i; + + for (i = 0; i < num; i++) + percpu_counter_destroy(&counters[i]); +} + +static int nfsd_stat_counters_init(void) +{ + return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +static void nfsd_stat_counters_destroy(void) +{ + nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); +} + +int nfsd_stat_init(void) +{ + int err; + + err = nfsd_stat_counters_init(); + if (err) + return err; + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); + + return 0; } -void -nfsd_stat_shutdown(void) +void nfsd_stat_shutdown(void) { + nfsd_stat_counters_destroy(); svc_proc_unregister(&init_net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index b23fdac69820..51ecda852e23 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -8,37 +8,91 @@ #define _NFSD_STATS_H #include <uapi/linux/nfsd/stats.h> +#include <linux/percpu_counter.h> -struct nfsd_stats { - unsigned int rchits; /* repcache hits */ - unsigned int rcmisses; /* repcache hits */ - unsigned int rcnocache; /* uncached reqs */ - unsigned int fh_stale; /* FH stale error */ - unsigned int fh_lookup; /* dentry cached */ - unsigned int fh_anon; /* anon file dentry returned */ - unsigned int fh_nocache_dir; /* filehandle not found in dcache */ - unsigned int fh_nocache_nondir; /* filehandle not found in dcache */ - unsigned int io_read; /* bytes returned to read requests */ - unsigned int io_write; /* bytes passed in write requests */ - unsigned int th_cnt; /* number of available threads */ - unsigned int th_usage[10]; /* number of ticks during which n perdeciles - * of available threads were in use */ - unsigned int th_fullcnt; /* number of times last free thread was used */ - unsigned int ra_size; /* size of ra cache */ - unsigned int ra_depth[11]; /* number of times ra entry was found that deep - * in the cache (10percentiles). [10] = not found */ +enum { + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ #ifdef CONFIG_NFSD_V4 - unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */ + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) #endif + NFSD_STATS_COUNTERS_NUM +}; + +struct nfsd_stats { + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + /* Protected by nfsd_mutex */ + unsigned int th_cnt; /* number of available threads */ }; extern struct nfsd_stats nfsdstats; + extern struct svc_stat nfsd_svcstats; -void nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); +void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_stat_init(void); +void nfsd_stat_shutdown(void); + +static inline void nfsd_stats_rc_hits_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); +} + +static inline void nfsd_stats_rc_misses_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); +} + +static inline void nfsd_stats_rc_nocache_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); +} + +static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + if (exp) + percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]); +} + +static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount); +} + +static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +{ + percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + if (exp) + percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount); +} + +static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) +{ + percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); +} + +static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} + +static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) +{ + percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); +} #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 04937e51de56..d316e11923c5 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -889,7 +889,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsdstats.io_read += host_err; + nfsd_stats_io_read_add(fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1040,7 +1040,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsdstats.io_write += *cnt; + nfsd_stats_io_write_add(exp, *cnt); fsnotify_modify(file); if (stable && use_wgather) { diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index ad77387734cc..3018b52b6d5e 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -27,7 +27,6 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - int vlen; }; struct nfsd_writeargs { @@ -53,11 +52,6 @@ struct nfsd_renameargs { unsigned int tlen; }; -struct nfsd_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -79,7 +73,6 @@ struct nfsd_readdirargs { struct svc_fh fh; __u32 cookie; __u32 count; - __be32 * buffer; }; struct nfsd_stat { @@ -144,14 +137,13 @@ union nfsd_xdrstore { #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore) -int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readargs(struct svc_rqst *, __be32 *); int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *); int nfssvc_decode_createargs(struct svc_rqst *, __be32 *); int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *); @@ -172,6 +164,6 @@ void nfssvc_release_readres(struct svc_rqst *rqstp); /* Helper functions for NFSv2 ACL code */ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat); -__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp); +bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* LINUX_NFSD_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 456fcd7a1038..3e1578953f54 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -25,14 +25,13 @@ struct nfsd3_diropargs { struct nfsd3_accessargs { struct svc_fh fh; - unsigned int access; + __u32 access; }; struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; - int vlen; }; struct nfsd3_writeargs { @@ -71,11 +70,6 @@ struct nfsd3_renameargs { unsigned int tlen; }; -struct nfsd3_readlinkargs { - struct svc_fh fh; - char * buffer; -}; - struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; @@ -96,10 +90,8 @@ struct nfsd3_symlinkargs { struct nfsd3_readdirargs { struct svc_fh fh; __u64 cookie; - __u32 dircount; __u32 count; __be32 * verf; - __be32 * buffer; }; struct nfsd3_commitargs { @@ -110,13 +102,13 @@ struct nfsd3_commitargs { struct nfsd3_getaclargs { struct svc_fh fh; - int mask; + __u32 mask; }; struct posix_acl; struct nfsd3_setaclargs { struct svc_fh fh; - int mask; + __u32 mask; struct posix_acl *acl_access; struct posix_acl *acl_default; }; @@ -273,7 +265,7 @@ union nfsd3_xdrstore { #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore) -int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *); +int nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *); @@ -283,7 +275,6 @@ int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *); -int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *); int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *); @@ -316,7 +307,6 @@ int nfs3svc_encode_entry_plus(void *, const char *name, /* Helper functions for NFSv3 ACL code */ __be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp); -__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp); - +bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp); #endif /* _LINUX_NFSD_XDR3_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 64bc81363c6c..e1bd592ce700 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -141,6 +141,7 @@ const struct file_operations nilfs_file_operations = { /* .release = nilfs_release_file, */ .fsync = nilfs_sync_file, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; const struct inode_operations nilfs_file_inode_operations = { diff --git a/fs/open.c b/fs/open.c index 1e06e443a565..ca5444733acd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1091,6 +1091,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; + if (how->resolve & RESOLVE_CACHED) { + /* Don't bother even trying for create/truncate/tmpfile open */ + if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + return -EAGAIN; + lookup_flags |= LOOKUP_CACHED; + } op->lookup_flags = lookup_flags; return 0; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index e5b616c93e11..0fed532efa68 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -84,6 +84,14 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, if (ovl_is_private_xattr(sb, name)) continue; + + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) @@ -107,13 +115,6 @@ retry: goto retry; } - error = security_inode_copy_up_xattr(name); - if (error < 0 && error != -EOPNOTSUPP) - break; - if (error == 1) { - error = 0; - continue; /* Discard */ - } error = vfs_setxattr(new, name, value, size, 0); if (error) { if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 28a075b5f5b2..d1efa3a5a503 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -992,8 +992,8 @@ static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect) buflen -= thislen; memcpy(&buf[buflen], name, thislen); - tmp = dget_dlock(d->d_parent); spin_unlock(&d->d_lock); + tmp = dget_parent(d); dput(d); d = tmp; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index bd9dd38347ae..077d3ad343f6 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -398,8 +398,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct cred *old_cred; int ret; - if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb))) - return 0; + ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); + if (ret <= 0) + return ret; ret = ovl_real_fdget_meta(file, &real, !datasync); if (ret) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d739e14c6814..cf41bcb664bc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -352,7 +352,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, goto out; if (!value && !upperdentry) { + old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(realdentry, name, NULL, 0); + revert_creds(old_cred); if (err < 0) goto out_drop_write; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b487e48c7fd4..cb4e2d60ecf9 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -324,6 +324,7 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, int padding); +int ovl_sync_status(struct ovl_fs *ofs); static inline bool ovl_is_impuredir(struct super_block *sb, struct dentry *dentry) diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index fbd5e27ce66b..63efee554f69 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -81,6 +81,8 @@ struct ovl_fs { atomic_long_t last_ino; /* Whiteout dentry cache */ struct dentry *whiteout; + /* r/o snapshot of upperdir sb's only taken on volatile mounts */ + errseq_t errseq; }; static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 01620ebae1bd..f404a78e6b60 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -865,7 +865,7 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; - struct file *realfile = od->realfile; + struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; @@ -874,29 +874,20 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { - struct inode *inode = file_inode(file); - realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); + if (IS_ERR(realfile)) + return realfile; - inode_lock(inode); - if (!od->upperfile) { - if (IS_ERR(realfile)) { - inode_unlock(inode); - return realfile; - } - smp_store_release(&od->upperfile, realfile); - } else { - /* somebody has beaten us to it */ - if (!IS_ERR(realfile)) - fput(realfile); - realfile = od->upperfile; + old = cmpxchg_release(&od->upperfile, NULL, realfile); + if (old) { + fput(realfile); + realfile = old; } - inode_unlock(inode); } } @@ -909,8 +900,9 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb))) - return 0; + err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + if (err <= 0) + return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2bd570cbe8a4..d58b8f2bf9d0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -264,11 +264,20 @@ static int ovl_sync_fs(struct super_block *sb, int wait) struct super_block *upper_sb; int ret; - if (!ovl_upper_mnt(ofs)) - return 0; + ret = ovl_sync_status(ofs); + /* + * We have to always set the err, because the return value isn't + * checked in syncfs, and instead indirectly return an error via + * the sb's writeback errseq, which VFS inspects after this call. + */ + if (ret < 0) { + errseq_set(&sb->s_wb_err, -EIO); + return -EIO; + } + + if (!ret) + return ret; - if (!ovl_should_sync(ofs)) - return 0; /* * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). * All the super blocks will be iterated, including upper_sb. @@ -1923,6 +1932,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; int err; + err = -EIO; + if (WARN_ON(sb->s_user_ns != current_user_ns())) + goto out; + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; @@ -1989,6 +2002,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { + struct super_block *upper_sb; + if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); goto out_err; @@ -1998,6 +2013,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_err; + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + if (!ovl_should_sync(ofs)) { + ofs->errseq = errseq_sample(&upper_sb->s_wb_err); + if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { + err = -EIO; + pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); + goto out_err; + } + } + err = ovl_get_workdir(sb, ofs, &upperpath); if (err) goto out_err; @@ -2005,9 +2030,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->workdir) sb->s_flags |= SB_RDONLY; - sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth; - sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran; - + sb->s_stack_depth = upper_sb->s_stack_depth; + sb->s_time_gran = upper_sb->s_time_gran; } oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); err = PTR_ERR(oe); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6569031af3cd..9826b003f1d2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -962,3 +962,30 @@ err_free: kfree(buf); return ERR_PTR(res); } + +/* + * ovl_sync_status() - Check fs sync status for volatile mounts + * + * Returns 1 if this is not a volatile mount and a real sync is required. + * + * Returns 0 if syncing can be skipped because mount is volatile, and no errors + * have occurred on the upperdir since the mount. + * + * Returns -errno if it is a volatile mount, and the error that occurred since + * the last mount. If the error code changes, it'll return the latest error + * code. + */ + +int ovl_sync_status(struct ovl_fs *ofs) +{ + struct vfsmount *mnt; + + if (ovl_should_sync(ofs)) + return 1; + + mnt = ovl_upper_mnt(ofs); + if (!mnt) + return 0; + + return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); +} diff --git a/fs/pipe.c b/fs/pipe.c index c5989cfd564d..39c96845a72f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1206,6 +1206,7 @@ const struct file_operations pipefifo_fops = { .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, }; /* diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..d2018f70d1fa 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,12 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + len = strlen(val); + if (len == 0) + return -EINVAL; + /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no @@ -1811,7 +1817,6 @@ static int process_sysctl_arg(char *param, char *val, file, param, val); goto out; } - len = strlen(val); wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 32f64abc277c..d963ae7902f9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -269,7 +269,7 @@ static int pstore_compress(const void *in, void *out, { int ret; - if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION)) + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); @@ -671,7 +671,7 @@ static void decompress_record(struct pstore_record *record) int unzipped_len; char *unzipped, *workspace; - if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed) + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; /* Only PSTORE_TYPE_DMESG support compression. */ diff --git a/fs/read_write.c b/fs/read_write.c index 75f764b43418..9db7adf160d2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1188,6 +1188,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, { struct fd in, out; struct inode *in_inode, *out_inode; + struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; @@ -1228,9 +1229,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; - retval = rw_verify_area(WRITE, out.file, &out_pos, count); - if (retval < 0) - goto fput_out; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); @@ -1253,9 +1251,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - file_start_write(out.file); - retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); + opipe = get_pipe_info(out.file, true); + if (!opipe) { + retval = rw_verify_area(WRITE, out.file, &out_pos, count); + if (retval < 0) + goto fput_out; + file_start_write(out.file); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, + count, fl); + file_end_write(out.file); + } else { + retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); + } if (retval > 0) { add_rchar(current, retval); diff --git a/fs/splice.c b/fs/splice.c index 866d5c2367b2..b06846f1e6ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -771,11 +771,16 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + unsigned int p_space; int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; + /* Don't try to read more the pipe has space for. */ + p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + len = min_t(size_t, len, p_space << PAGE_SHIFT); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; @@ -856,15 +861,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { - unsigned int p_space; size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - - pipe_occupancy(pipe->head, pipe->tail); - read_len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = do_splice_to(in, &pos, pipe, read_len, flags); + ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -1002,6 +1002,23 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); +long splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags) +{ + long ret; + + pipe_lock(opipe); + ret = wait_for_space(opipe, flags); + if (!ret) + ret = do_splice_to(in, offset, opipe, len, flags); + pipe_unlock(opipe); + if (ret > 0) + wakeup_pipe_readers(opipe); + return ret; +} + /* * Determine where to splice to/from. */ @@ -1081,20 +1098,7 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - pipe_lock(opipe); - ret = wait_for_space(opipe, flags); - if (!ret) { - unsigned int p_space; - - /* Don't try to read more the pipe has space for. */ - p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); - len = min_t(size_t, len, p_space << PAGE_SHIFT); - - ret = do_splice_to(in, &offset, opipe, len, flags); - } - pipe_unlock(opipe); - if (ret > 0) - wakeup_pipe_readers(opipe); + ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 8a19773b5a0b..45f44425d856 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -196,9 +196,15 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, length = SQUASHFS_COMPRESSED_SIZE(length); index += 2; - TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? "" : "un", length); } + if (length < 0 || length > output->length || + (index + length) > msblk->bytes_used) { + res = -EIO; + goto out; + } + if (next_index) *next_index = index + length; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index ae2c87bb0fbe..eb02072d28dd 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -41,12 +41,17 @@ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) struct squashfs_sb_info *msblk = sb->s_fs_info; int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); - u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + u64 start; __le64 ino; int err; TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + if (ino_num == 0 || (ino_num - 1) >= msblk->inodes) + return -EINVAL; + + start = le64_to_cpu(msblk->inode_lookup_table[blk]); + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); if (err < 0) return err; @@ -111,7 +116,10 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, u64 lookup_table_start, u64 next_table, unsigned int inodes) { unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes); + int n; __le64 *table; + u64 start, end; TRACE("In read_inode_lookup_table, length %d\n", length); @@ -121,20 +129,37 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, if (inodes == 0) return ERR_PTR(-EINVAL); - /* length bytes should not extend into the next table - this check - * also traps instances where lookup_table_start is incorrectly larger - * than the next table start + /* + * The computed size of the lookup table (length bytes) should exactly + * match the table start and end points */ - if (lookup_table_start + length > next_table) + if (length != (next_table - lookup_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, lookup_table_start, length); + if (IS_ERR(table)) + return table; /* - * table[0] points to the first inode lookup table metadata block, - * this should be less than lookup_table_start + * table0], table[1], ... table[indexes - 1] store the locations + * of the compressed inode lookup blocks. Each entry should be + * less than the next (i.e. table[0] < table[1]), and the difference + * between them should be SQUASHFS_METADATA_SIZE or less. + * table[indexes - 1] should be less than lookup_table_start, and + * again the difference should be SQUASHFS_METADATA_SIZE or less */ - if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) { + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); + + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c index 6be5afe7287d..11581bf31af4 100644 --- a/fs/squashfs/id.c +++ b/fs/squashfs/id.c @@ -35,10 +35,15 @@ int squashfs_get_id(struct super_block *sb, unsigned int index, struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_ID_BLOCK(index); int offset = SQUASHFS_ID_BLOCK_OFFSET(index); - u64 start_block = le64_to_cpu(msblk->id_table[block]); + u64 start_block; __le32 disk_id; int err; + if (index >= msblk->ids) + return -EINVAL; + + start_block = le64_to_cpu(msblk->id_table[block]); + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, sizeof(disk_id)); if (err < 0) @@ -56,7 +61,10 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, u64 id_table_start, u64 next_table, unsigned short no_ids) { unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids); + int n; __le64 *table; + u64 start, end; TRACE("In read_id_index_table, length %d\n", length); @@ -67,20 +75,36 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb, return ERR_PTR(-EINVAL); /* - * length bytes should not extend into the next table - this check - * also traps instances where id_table_start is incorrectly larger - * than the next table start + * The computed size of the index table (length bytes) should exactly + * match the table start and end points */ - if (id_table_start + length > next_table) + if (length != (next_table - id_table_start)) return ERR_PTR(-EINVAL); table = squashfs_read_table(sb, id_table_start, length); + if (IS_ERR(table)) + return table; /* - * table[0] points to the first id lookup table metadata block, this - * should be less than id_table_start + * table[0], table[1], ... table[indexes - 1] store the locations + * of the compressed id blocks. Each entry should be less than + * the next (i.e. table[0] < table[1]), and the difference between them + * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] + * should be less than id_table_start, and again the difference + * should be SQUASHFS_METADATA_SIZE or less */ - if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) { + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); + + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) { kfree(table); return ERR_PTR(-EINVAL); } diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 34c21ffb6df3..166e98806265 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -64,5 +64,6 @@ struct squashfs_sb_info { unsigned int inodes; unsigned int fragments; int xattr_ids; + unsigned int ids; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index d6c6593ec169..88cc94be1076 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -166,6 +166,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); msblk->fragments = le32_to_cpu(sblk->fragments); + msblk->ids = le16_to_cpu(sblk->no_ids); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); @@ -177,7 +178,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); TRACE("Number of fragments %d\n", msblk->fragments); - TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("Number of ids %d\n", msblk->ids); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); TRACE("sblk->fragment_table_start %llx\n", @@ -236,8 +237,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, - le64_to_cpu(sblk->id_table_start), next_table, - le16_to_cpu(sblk->no_ids)); + le64_to_cpu(sblk->id_table_start), next_table, msblk->ids); if (IS_ERR(msblk->id_table)) { errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index 184129afd456..d8a270d3ac4c 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -17,8 +17,16 @@ extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) { + struct squashfs_xattr_id_table *id_table; + + id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + if (IS_ERR(id_table)) + return (__le64 *) id_table; + + *xattr_table_start = le64_to_cpu(id_table->xattr_table_start); + kfree(id_table); + ERROR("Xattrs in filesystem, these will be ignored\n"); - *xattr_table_start = start; return ERR_PTR(-ENOTSUPP); } diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index d99e08464554..ead66670b41a 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -31,10 +31,15 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, struct squashfs_sb_info *msblk = sb->s_fs_info; int block = SQUASHFS_XATTR_BLOCK(index); int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index); - u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]); + u64 start_block; struct squashfs_xattr_id id; int err; + if (index >= msblk->xattr_ids) + return -EINVAL; + + start_block = le64_to_cpu(msblk->xattr_id_table[block]); + err = squashfs_read_metadata(sb, &id, &start_block, &offset, sizeof(id)); if (err < 0) @@ -50,13 +55,17 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, /* * Read uncompressed xattr id lookup table indexes from disk into memory */ -__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, +__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, u64 *xattr_table_start, int *xattr_ids) { - unsigned int len; + struct squashfs_sb_info *msblk = sb->s_fs_info; + unsigned int len, indexes; struct squashfs_xattr_id_table *id_table; + __le64 *table; + u64 start, end; + int n; - id_table = squashfs_read_table(sb, start, sizeof(*id_table)); + id_table = squashfs_read_table(sb, table_start, sizeof(*id_table)); if (IS_ERR(id_table)) return (__le64 *) id_table; @@ -70,13 +79,52 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, if (*xattr_ids == 0) return ERR_PTR(-EINVAL); - /* xattr_table should be less than start */ - if (*xattr_table_start >= start) + len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids); + + /* + * The computed size of the index table (len bytes) should exactly + * match the table start and end points + */ + start = table_start + sizeof(*id_table); + end = msblk->bytes_used; + + if (len != (end - start)) return ERR_PTR(-EINVAL); - len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); + table = squashfs_read_table(sb, start, len); + if (IS_ERR(table)) + return table; + + /* table[0], table[1], ... table[indexes - 1] store the locations + * of the compressed xattr id blocks. Each entry should be less than + * the next (i.e. table[0] < table[1]), and the difference between them + * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1] + * should be less than table_start, and again the difference + * shouls be SQUASHFS_METADATA_SIZE or less. + * + * Finally xattr_table_start should be less than table[0]. + */ + for (n = 0; n < (indexes - 1); n++) { + start = le64_to_cpu(table[n]); + end = le64_to_cpu(table[n + 1]); + + if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } + } + + start = le64_to_cpu(table[indexes - 1]); + if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) { + kfree(table); + return ERR_PTR(-EINVAL); + } - TRACE("In read_xattr_index_table, length %d\n", len); + if (*xattr_table_start >= le64_to_cpu(table[0])) { + kfree(table); + return ERR_PTR(-EINVAL); + } - return squashfs_read_table(sb, start + sizeof(*id_table), len); + return table; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 5bef3a68395d..d0df217f4712 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -705,6 +705,7 @@ static int udf_check_vsd(struct super_block *sb) struct buffer_head *bh = NULL; int nsr = 0; struct udf_sb_info *sbi; + loff_t session_offset; sbi = UDF_SB(sb); if (sb->s_blocksize < sizeof(struct volStructDesc)) @@ -712,7 +713,8 @@ static int udf_check_vsd(struct super_block *sb) else sectorsize = sb->s_blocksize; - sector += (((loff_t)sbi->s_session) << sb->s_blocksize_bits); + session_offset = (loff_t)sbi->s_session << sb->s_blocksize_bits; + sector += session_offset; udf_debug("Starting at sector %u (%lu byte sectors)\n", (unsigned int)(sector >> sb->s_blocksize_bits), @@ -757,8 +759,7 @@ static int udf_check_vsd(struct super_block *sb) if (nsr > 0) return 1; - else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) == - VSD_FIRST_SECTOR_OFFSET) + else if (!bh && sector - session_offset == VSD_FIRST_SECTOR_OFFSET) return -1; else return 0; diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 570e9136334d..435559a4fa9e 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \ init.o \ measure.o \ open.o \ + read_metadata.o \ verify.o obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 6413d28664d6..a7920434bae5 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *desc, size_t desc_size); + struct fsverity_descriptor *desc, + size_t desc_size); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret); + int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); @@ -135,15 +140,13 @@ void __init fsverity_exit_info_cache(void); #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size); + const u8 *signature, size_t sig_size); int __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { return 0; } diff --git a/fs/verity/open.c b/fs/verity/open.c index 228d0eca3e2e..60ff8af7219f 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, } /* - * Validate the given fsverity_descriptor and create a new fsverity_info from - * it. The signature (if present) is also checked. + * Create a new fsverity_info from the given fsverity_descriptor (with optional + * appended signature), and check the signature if present. The + * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *_desc, size_t desc_size) + struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_descriptor *desc = _desc; struct fsverity_info *vi; int err; - if (desc_size < sizeof(*desc)) { - fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", - desc_size); - return ERR_PTR(-EINVAL); - } - - if (desc->version != 1) { - fsverity_err(inode, "Unrecognized descriptor version: %u", - desc->version); - return ERR_PTR(-EINVAL); - } - - if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { - fsverity_err(inode, "Reserved bits set in descriptor"); - return ERR_PTR(-EINVAL); - } - - if (desc->salt_size > sizeof(desc->salt)) { - fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); - return ERR_PTR(-EINVAL); - } - - if (le64_to_cpu(desc->data_size) != inode->i_size) { - fsverity_err(inode, - "Wrong data_size: %llu (desc) != %lld (inode)", - le64_to_cpu(desc->data_size), inode->i_size); - return ERR_PTR(-EINVAL); - } - vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); @@ -209,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->tree_params.hash_alg->name, vi->tree_params.digest_size, vi->file_digest); - err = fsverity_verify_signature(vi, desc, desc_size); + err = fsverity_verify_signature(vi, desc->signature, + le32_to_cpu(desc->sig_size)); out: if (err) { fsverity_free_info(vi); @@ -245,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi) kmem_cache_free(fsverity_info_cachep, vi); } -/* Ensure the inode has an ->i_verity_info */ -static int ensure_verity_info(struct inode *inode) +static bool validate_fsverity_descriptor(struct inode *inode, + const struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_info *vi = fsverity_get_info(inode); - struct fsverity_descriptor *desc; - int res; + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return false; + } - if (vi) - return 0; + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return false; + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return false; + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return false; + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return false; + } + + if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return false; + } + + return true; +} + +/* + * Read the inode's fsverity_descriptor (with optional appended signature) from + * the filesystem, and do basic validation of it. + */ +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret) +{ + int res; + struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { @@ -272,20 +287,46 @@ static int ensure_verity_info(struct inode *inode) res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); - goto out_free_desc; + kfree(desc); + return res; + } + + if (!validate_fsverity_descriptor(inode, desc, res)) { + kfree(desc); + return -EINVAL; } - vi = fsverity_create_info(inode, desc, res); + *desc_ret = desc; + *desc_size_ret = res; + return 0; +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + size_t desc_size; + int err; + + if (vi) + return 0; + + err = fsverity_get_descriptor(inode, &desc, &desc_size); + if (err) + return err; + + vi = fsverity_create_info(inode, desc, desc_size); if (IS_ERR(vi)) { - res = PTR_ERR(vi); + err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); - res = 0; + err = 0; out_free_desc: kfree(desc); - return res; + return err; } /** diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c new file mode 100644 index 000000000000..7e2d0c7bdf0d --- /dev/null +++ b/fs/verity/read_metadata.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ioctl to read verity metadata + * + * Copyright 2021 Google LLC + */ + +#include "fsverity_private.h" + +#include <linux/backing-dev.h> +#include <linux/highmem.h> +#include <linux/sched/signal.h> +#include <linux/uaccess.h> + +static int fsverity_read_merkle_tree(struct inode *inode, + const struct fsverity_info *vi, + void __user *buf, u64 offset, int length) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + u64 end_offset; + unsigned int offs_in_page; + pgoff_t index, last_index; + int retval = 0; + int err = 0; + + end_offset = min(offset + length, vi->tree_params.tree_size); + if (offset >= end_offset) + return 0; + offs_in_page = offset_in_page(offset); + last_index = (end_offset - 1) >> PAGE_SHIFT; + + /* + * Iterate through each Merkle tree page in the requested range and copy + * the requested portion to userspace. Note that the Merkle tree block + * size isn't important here, as we are returning a byte stream; i.e., + * we can just work with pages even if the tree block size != PAGE_SIZE. + */ + for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { + unsigned long num_ra_pages = + min_t(unsigned long, last_index - index + 1, + inode->i_sb->s_bdi->io_pages); + unsigned int bytes_to_copy = min_t(u64, end_offset - offset, + PAGE_SIZE - offs_in_page); + struct page *page; + const void *virt; + + page = vops->read_merkle_tree_page(inode, index, num_ra_pages); + if (IS_ERR(page)) { + err = PTR_ERR(page); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, index); + break; + } + + virt = kmap(page); + if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { + kunmap(page); + put_page(page); + err = -EFAULT; + break; + } + kunmap(page); + put_page(page); + + retval += bytes_to_copy; + buf += bytes_to_copy; + offset += bytes_to_copy; + + if (fatal_signal_pending(current)) { + err = -EINTR; + break; + } + cond_resched(); + offs_in_page = 0; + } + return retval ? retval : err; +} + +/* Copy the requested portion of the buffer to userspace. */ +static int fsverity_read_buffer(void __user *dst, u64 offset, int length, + const void *src, size_t src_length) +{ + if (offset >= src_length) + return 0; + src += offset; + src_length -= offset; + + length = min_t(size_t, length, src_length); + + if (copy_to_user(dst, src, length)) + return -EFAULT; + + return length; +} + +static int fsverity_read_descriptor(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + /* don't include the signature */ + desc_size = offsetof(struct fsverity_descriptor, signature); + desc->sig_size = 0; + + res = fsverity_read_buffer(buf, offset, length, desc, desc_size); + + kfree(desc); + return res; +} + +static int fsverity_read_signature(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + if (desc->sig_size == 0) { + res = -ENODATA; + goto out; + } + + /* + * Include only the signature. Note that fsverity_get_descriptor() + * already verified that sig_size is in-bounds. + */ + res = fsverity_read_buffer(buf, offset, length, desc->signature, + le32_to_cpu(desc->sig_size)); +out: + kfree(desc); + return res; +} + +/** + * fsverity_ioctl_read_metadata() - read verity metadata from a file + * @filp: file to read the metadata from + * @uarg: user pointer to fsverity_read_metadata_arg + * + * Return: length read on success, 0 on EOF, -errno on failure + */ +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_info *vi; + struct fsverity_read_metadata_arg arg; + int length; + void __user *buf; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + /* + * Note that we don't have to explicitly check that the file is open for + * reading, since verity files can only be opened for reading. + */ + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.__reserved) + return -EINVAL; + + /* offset + length must not overflow. */ + if (arg.offset + arg.length < arg.offset) + return -EINVAL; + + /* Ensure that the return value will fit in INT_MAX. */ + length = min_t(u64, arg.length, INT_MAX); + + buf = u64_to_user_ptr(arg.buf_ptr); + + switch (arg.metadata_type) { + case FS_VERITY_METADATA_TYPE_MERKLE_TREE: + return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, + length); + case FS_VERITY_METADATA_TYPE_DESCRIPTOR: + return fsverity_read_descriptor(inode, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_SIGNATURE: + return fsverity_read_signature(inode, buf, arg.offset, length); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata); diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 012468eda2a7..143a530a8008 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -29,21 +29,19 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature * @vi: the file's fsverity_info - * @desc: the file's fsverity_descriptor - * @desc_size: size of @desc + * @signature: the file's built-in signature + * @sig_size: size of signature in bytes, or 0 if no signature * - * If the file's fs-verity descriptor includes a signature of the file digest, - * verify it against the certificates in the fs-verity keyring. + * If the file includes a signature of its fs-verity file digest, verify it + * against the certificates in the fs-verity keyring. * * Return: 0 on success (signature valid or not required); -errno on failure */ int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; - const u32 sig_size = le32_to_cpu(desc->sig_size); struct fsverity_formatted_digest *d; int err; @@ -56,11 +54,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } - if (sig_size > desc_size - sizeof(*desc)) { - fsverity_err(inode, "Signature overflows verity descriptor"); - return -EBADMSG; - } - d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -70,8 +63,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, - desc->signature, sig_size, - fsverity_keyring, + signature, sig_size, fsverity_keyring, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); kfree(d); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 38528e59030e..dc91973c0b4f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -223,30 +223,42 @@ xfs_file_fsync( return error; } +static int +xfs_ilock_iocb( + struct kiocb *iocb, + unsigned int lock_mode) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, lock_mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lock_mode); + } + + return 0; +} + STATIC ssize_t -xfs_file_dio_aio_read( +xfs_file_dio_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); - size_t count = iov_iter_count(to); ssize_t ret; - trace_xfs_file_direct_read(ip, count, iocb->ki_pos); + trace_xfs_file_direct_read(iocb, to); - if (!count) + if (!iov_iter_count(to)) return 0; /* skip atime */ file_accessed(iocb->ki_filp); - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) - return -EAGAIN; - } else { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, - is_sync_kiocb(iocb)); + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -258,21 +270,16 @@ xfs_file_dax_read( struct iov_iter *to) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); - size_t count = iov_iter_count(to); ssize_t ret = 0; - trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + trace_xfs_file_dax_read(iocb, to); - if (!count) + if (!iov_iter_count(to)) return 0; /* skip atime */ - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) - return -EAGAIN; - } else { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -281,21 +288,18 @@ xfs_file_dax_read( } STATIC ssize_t -xfs_file_buffered_aio_read( +xfs_file_buffered_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); ssize_t ret; - trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + trace_xfs_file_buffered_read(iocb, to); - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) - return -EAGAIN; - } else { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } + ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); + if (ret) + return ret; ret = generic_file_read_iter(iocb, to); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -319,9 +323,9 @@ xfs_file_read_iter( if (IS_DAX(inode)) ret = xfs_file_dax_read(iocb, to); else if (iocb->ki_flags & IOCB_DIRECT) - ret = xfs_file_dio_aio_read(iocb, to); + ret = xfs_file_dio_read(iocb, to); else - ret = xfs_file_buffered_aio_read(iocb, to); + ret = xfs_file_buffered_read(iocb, to); if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); @@ -336,7 +340,7 @@ xfs_file_read_iter( * if called for a direct write beyond i_size. */ STATIC ssize_t -xfs_file_aio_write_checks( +xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, int *iolock) @@ -354,7 +358,14 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, BREAK_WRITE); + if (iocb->ki_flags & IOCB_NOWAIT) { + error = break_layout(inode, false); + if (error == -EWOULDBLOCK) + error = -EAGAIN; + } else { + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); + } + if (error) return error; @@ -365,7 +376,11 @@ restart: if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); + error = xfs_ilock_iocb(iocb, *iolock); + if (error) { + *iolock = 0; + return error; + } goto restart; } /* @@ -387,6 +402,10 @@ restart: isize = i_size_read(inode); if (iocb->ki_pos > isize) { spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_iunlock(ip, *iolock); @@ -500,122 +519,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { }; /* - * xfs_file_dio_aio_write - handle direct IO writes - * - * Lock the inode appropriately to prepare for and issue a direct IO write. - * By separating it from the buffered write path we remove all the tricky to - * follow locking changes and looping. - * - * If there are cached pages or we're extending the file, we need IOLOCK_EXCL - * until we're sure the bytes at the new EOF have been zeroed and/or the cached - * pages are flushed out. - * - * In most cases the direct IO writes will be done holding IOLOCK_SHARED - * allowing them to be done in parallel with reads and other direct IO writes. - * However, if the IO is not aligned to filesystem blocks, the direct IO layer - * needs to do sub-block zeroing and that requires serialisation against other - * direct IOs to the same block. In this case we need to serialise the - * submission of the unaligned IOs so that we don't get racing block zeroing in - * the dio layer. To avoid the problem with aio, we also need to wait for - * outstanding IOs to complete so that unwritten extent conversion is completed - * before we try to map the overlapping block. This is currently implemented by - * hitting it with a big hammer (i.e. inode_dio_wait()). - * - * Returns with locks held indicated by @iolock and errors indicated by - * negative return values. + * Handle block aligned direct I/O writes */ -STATIC ssize_t -xfs_file_dio_aio_write( +static noinline ssize_t +xfs_file_dio_write_aligned( + struct xfs_inode *ip, struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; - size_t count = iov_iter_count(from); - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + int iolock = XFS_IOLOCK_SHARED; + ssize_t ret; - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) - return -EINVAL; + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock); + if (ret) + goto out_unlock; /* - * Don't take the exclusive iolock here unless the I/O is unaligned to - * the file system block size. We don't need to consider the EOF - * extension case here because xfs_file_aio_write_checks() will relock - * the inode as necessary for EOF zeroing cases and fill out the new - * inode size as appropriate. + * We don't need to hold the IOLOCK exclusively across the IO, so demote + * the iolock back to shared if we had to take the exclusive lock in + * xfs_file_write_checks() for other reasons. */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + count) & mp->m_blockmask)) { - unaligned_io = 1; - - /* - * We can't properly handle unaligned direct I/O to reflink - * files yet, as we can't unshare a partial block. - */ - if (xfs_is_cow_inode(ip)) { - trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); - return -ENOTBLK; - } - iolock = XFS_IOLOCK_EXCL; - } else { + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, + &xfs_dio_write_ops, 0); +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; +} - if (iocb->ki_flags & IOCB_NOWAIT) { - /* unaligned dio always waits, bail */ - if (unaligned_io) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, iolock)) +/* + * Handle block unaligned direct I/O writes + * + * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing + * them to be done in parallel with reads and other direct I/O writes. However, + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need + * to do sub-block zeroing and that requires serialisation against other direct + * I/O to the same block. In this case we need to serialise the submission of + * the unaligned I/O so that we don't get racing block zeroing in the dio layer. + * In the case where sub-block zeroing is not required, we can do concurrent + * sub-block dios to the same block successfully. + * + * Optimistically submit the I/O using the shared lock first, but use the + * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN + * if block allocation or partial block zeroing would be required. In that case + * we try again with the exclusive lock. + */ +static noinline ssize_t +xfs_file_dio_write_unaligned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + size_t isize = i_size_read(VFS_I(ip)); + size_t count = iov_iter_count(from); + int iolock = XFS_IOLOCK_SHARED; + unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY; + ssize_t ret; + + /* + * Extending writes need exclusivity because of the sub-block zeroing + * that the DIO code always does for partial tail blocks beyond EOF, so + * don't even bother trying the fast path in this case. + */ + if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { +retry_exclusive: + if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; - } else { - xfs_ilock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_ilock_iocb(iocb, iolock); if (ret) - goto out; - count = iov_iter_count(from); + return ret; /* - * If we are doing unaligned IO, we can't allow any other overlapping IO - * in-flight at the same time or we risk data corruption. Wait for all - * other IO to drain before we submit. If the IO is aligned, demote the - * iolock if we had to take the exclusive lock in - * xfs_file_aio_write_checks() for other reasons. + * We can't properly handle unaligned direct I/O to reflink files yet, + * as we can't unshare a partial block. */ - if (unaligned_io) { - inode_dio_wait(inode); - } else if (iolock == XFS_IOLOCK_EXCL) { - xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + if (xfs_is_cow_inode(ip)) { + trace_xfs_reflink_bounce_dio_write(iocb, from); + ret = -ENOTBLK; + goto out_unlock; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + ret = xfs_file_write_checks(iocb, from, &iolock); + if (ret) + goto out_unlock; + /* - * If unaligned, this is the only IO in-flight. Wait on it before we - * release the iolock to prevent subsequent overlapping IO. + * If we are doing exclusive unaligned I/O, this must be the only I/O + * in-flight. Otherwise we risk data corruption due to unwritten extent + * conversions from the AIO end_io handler. Wait for all other I/O to + * drain first. */ + if (flags & IOMAP_DIO_FORCE_WAIT) + inode_dio_wait(VFS_I(ip)); + + trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_io); -out: - xfs_iunlock(ip, iolock); + &xfs_dio_write_ops, flags); /* - * No fallback to buffered IO after short writes for XFS, direct I/O - * will either complete fully or return an error. + * Retry unaligned I/O with exclusive blocking semantics if the DIO + * layer rejected it for mapping or locking reasons. If we are doing + * nonblocking user I/O, propagate the error. */ - ASSERT(ret < 0 || ret == count); + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) { + ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY); + xfs_iunlock(ip, iolock); + goto retry_exclusive; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); return ret; } +static ssize_t +xfs_file_dio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + size_t count = iov_iter_count(from); + + /* direct I/O must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) + return -EINVAL; + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + return xfs_file_dio_write_unaligned(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from); +} + static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, @@ -625,31 +671,26 @@ xfs_file_dax_write( struct xfs_inode *ip = XFS_I(inode); int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; - size_t count; loff_t pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!xfs_ilock_nowait(ip, iolock)) - return -EAGAIN; - } else { - xfs_ilock(ip, iolock); - } - - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; + ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) goto out; pos = iocb->ki_pos; - count = iov_iter_count(from); - trace_xfs_file_dax_write(ip, count, pos); + trace_xfs_file_dax_write(iocb, from); ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } out: - xfs_iunlock(ip, iolock); + if (iolock) + xfs_iunlock(ip, iolock); if (error) return error; @@ -663,7 +704,7 @@ out: } STATIC ssize_t -xfs_file_buffered_aio_write( +xfs_file_buffered_write( struct kiocb *iocb, struct iov_iter *from) { @@ -682,14 +723,14 @@ write_retry: iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); if (likely(ret >= 0)) @@ -765,12 +806,12 @@ xfs_file_write_iter( * CoW. In all other directio scenarios we do not * allow an operation to fall back to buffered mode. */ - ret = xfs_file_dio_aio_write(iocb, from); + ret = xfs_file_dio_write(iocb, from); if (ret != -ENOTBLK) return ret; } - return xfs_file_buffered_aio_write(iocb, from); + return xfs_file_buffered_write(iocb, from); } static void diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6594f572096e..e17ab7f42928 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -773,15 +773,28 @@ xfs_direct_write_iomap_begin( goto allocate_blocks; /* - * NOWAIT IO needs to span the entire requested IO with a single map so - * that we avoid partial IO failures due to the rest of the IO range not - * covered by this map triggering an EAGAIN condition when it is - * subsequently mapped and aborting the IO. + * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with + * a single map so that we avoid partial IO failures due to the rest of + * the I/O range not covered by this map triggering an EAGAIN condition + * when it is subsequently mapped and aborting the I/O. */ - if ((flags & IOMAP_NOWAIT) && - !imap_spans_range(&imap, offset_fsb, end_fsb)) { + if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) { error = -EAGAIN; - goto out_unlock; + if (!imap_spans_range(&imap, offset_fsb, end_fsb)) + goto out_unlock; + } + + /* + * For overwrite only I/O, we cannot convert unwritten extents without + * requiring sub-block zeroing. This can only be done under an + * exclusive IOLOCK, hence return -EAGAIN if this is not a written + * extent to tell the caller to try again. + */ + if (flags & IOMAP_OVERWRITE_ONLY) { + error = -EAGAIN; + if (imap.br_state != XFS_EXT_NORM && + ((offset | length) & mp->m_blockmask)) + goto out_unlock; } xfs_iunlock(ip, lockmode); @@ -790,7 +803,7 @@ xfs_direct_write_iomap_begin( allocate_blocks: error = -EAGAIN; - if (flags & IOMAP_NOWAIT) + if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) goto out_unlock; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 63ecbc6b1a76..e74bbb648f83 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1286,8 +1286,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), - TP_ARGS(ip, count, offset), + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), + TP_ARGS(iocb, iter), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -1296,11 +1296,11 @@ DECLARE_EVENT_CLASS(xfs_file_class, __field(size_t, count) ), TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->size = ip->i_d.di_size; - __entry->offset = offset; - __entry->count = count; + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino; + __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size; + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1312,14 +1312,16 @@ DECLARE_EVENT_CLASS(xfs_file_class, #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ - TP_ARGS(ip, count, offset)) + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \ + TP_ARGS(iocb, iter)) DEFINE_RW_EVENT(xfs_file_buffered_read); DEFINE_RW_EVENT(xfs_file_direct_read); DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); +DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, @@ -3293,8 +3295,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index bec47f2d074b..0e7ab0bc00ae 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -780,7 +780,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, sync); + &zonefs_write_dio_ops, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -917,7 +917,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, - &zonefs_read_dio_ops, is_sync_kiocb(iocb)); + &zonefs_read_dio_ops, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) |