diff options
Diffstat (limited to 'fs')
132 files changed, 4158 insertions, 2144 deletions
diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 2fe402483ad5..30b066299d39 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -740,10 +740,22 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); - int seq = 0; + struct key *key; + int ret, seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + if (!(query_flags & AT_STATX_DONT_SYNC) && + !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(&init_user_ns, inode, stat); diff --git a/fs/afs/write.c b/fs/afs/write.c index 6bcf1475511b..4763132ca57e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -616,8 +616,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ - fscache_clear_page_bits(afs_vnode_cache(vnode), - mapping, start, len, caching); + fscache_clear_page_bits(mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6556e13ed95f..63c7ebb0da89 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1117,11 +1117,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (interpreter || alignment > ELF_MIN_ALIGN) { + if (interpreter) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c22d287e020b..0dd6de994199 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2503,12 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2946,7 +2940,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -3012,7 +3005,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3113,7 +3105,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3171,7 +3162,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3455,7 +3445,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3542,7 +3532,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3657,10 +3651,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3750,9 +3751,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 93aabc68bb6a..e8308f2ad07d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,15 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, }; struct btrfs_caching_control { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 47e72d72f7d0..32131a5d321b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -384,6 +384,17 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} + struct btrfs_dio_private { struct inode *inode; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index be476f094300..19bf36d8ffea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -537,6 +537,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int index = offset >> PAGE_SHIFT; @@ -555,6 +558,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio = NULL; goto finish_cb; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -612,6 +617,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, return 0; finish_cb: + if (blkcg_css) + kthread_associate_blkcg(NULL); + if (bio) { bio->bi_status = ret; bio_endio(bio); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7631b88426e..077c95e9baa5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1060,6 +1060,7 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; u64 nr_global_roots; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 71fd99b48283..f26202621989 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. + */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b30309f187cf..84795d831282 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1850,9 +1850,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; @@ -3156,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -3656,6 +3658,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); @@ -4225,6 +4238,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4234,7 +4248,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4247,12 +4261,12 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; @@ -4266,7 +4280,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f477035a2ac2..6260784e74b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1239,7 +1239,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1256,7 +1256,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } @@ -1291,7 +1291,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -4082,7 +4082,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) @@ -5987,7 +5987,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 724e8fe06aa0..33c19f51d79b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode, repair_bio = btrfs_bio_alloc(1); repair_bbio = btrfs_bio(repair_bio); + repair_bbio->file_offset = start; repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; @@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode, ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17d5557f98ec..95c499b8424e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -481,17 +481,6 @@ static noinline int add_async_extent(struct async_chunk *cow, } /* - * Check if the inode has flags compatible with compression - */ -static inline bool inode_can_compress(struct btrfs_inode *inode) -{ - if (inode->flags & BTRFS_INODE_NODATACOW || - inode->flags & BTRFS_INODE_NODATASUM) - return false; - return true; -} - -/* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ @@ -500,7 +489,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, { struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(inode)) { + if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); @@ -2016,11 +2005,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page * to use run_delalloc_nocow() here, like for regular * preallocated inodes. */ - ASSERT(!zoned || - (zoned && btrfs_is_data_reloc_root(inode->root))); + ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!btrfs_inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, @@ -7444,6 +7432,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, u64 block_start, orig_start, orig_block_len, ram_bytes; bool can_nocow = false; bool space_reserved = false; + u64 prev_len; int ret = 0; /* @@ -7471,6 +7460,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, can_nocow = true; } + prev_len = len; if (can_nocow) { struct extent_map *em2; @@ -7500,8 +7490,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; } } else { - const u64 prev_len = len; - /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; @@ -7532,7 +7520,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ - btrfs_delalloc_release_extents(BTRFS_I(inode), len); + btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered @@ -7811,8 +7799,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - const u64 orig_file_offset = dip->file_offset; - u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; @@ -7822,6 +7808,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; for (i = 0; i < nr_sectors; i++) { + u64 start = bbio->file_offset + bio_offset; + ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, bbio, @@ -7834,17 +7822,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ASSERT((start - orig_file_offset) < UINT_MAX); - ret = btrfs_repair_one_sector(inode, - &bbio->bio, - start - orig_file_offset, - bvec.bv_page, pgoff, + ret = btrfs_repair_one_sector(inode, &bbio->bio, + bio_offset, bvec.bv_page, pgoff, start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); } - start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); bio_offset += sectorsize; pgoff += sectorsize; @@ -7871,6 +7855,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t err = bio->bi_status; if (err) @@ -7881,12 +7866,12 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); + err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); + btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8047,6 +8032,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_bio(bio)->file_offset = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f46e71061942..b2c692b2fd8d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -468,7 +468,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; - struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; @@ -498,14 +497,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { - if (!device->bdev) + if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min_t(u64, q->limits.discard_granularity, - minlen); - } + num_devices++; + minlen = min_t(u64, bdev_discard_granularity(device->bdev), + minlen); } rcu_read_unlock(); @@ -2565,7 +2561,12 @@ static noinline int search_ioctl(struct inode *inode, while (1) { ret = -EFAULT; - if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) + /* + * Ensure that the whole user buffer is faulted in at sub-page + * granularity, otherwise the loop may live-lock. + */ + if (fault_in_subpage_writeable(ubuf + sk_offset, + *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -5456,8 +5457,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1a6d2d5b4b33..1b31481f9e72 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -17,9 +17,11 @@ static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); struct prop_handler { struct hlist_node node; const char *xattr_name; - int (*validate)(const char *value, size_t len); + int (*validate)(const struct btrfs_inode *inode, const char *value, + size_t len); int (*apply)(struct inode *inode, const char *value, size_t len); const char *(*extract)(struct inode *inode); + bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -55,7 +57,8 @@ find_prop_handler(const char *name, return NULL; } -int btrfs_validate_prop(const char *name, const char *value, size_t value_len) +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len) { const struct prop_handler *handler; @@ -69,7 +72,29 @@ int btrfs_validate_prop(const char *name, const char *value, size_t value_len) if (value_len == 0) return 0; - return handler->validate(value, value_len); + return handler->validate(inode, value, value_len); +} + +/* + * Check if a property should be ignored (not set) for an inode. + * + * @inode: The target inode. + * @name: The property's name. + * + * The caller must be sure the given property name is valid, for example by + * having previously called btrfs_validate_prop(). + * + * Returns: true if the property should be ignored for the given inode + * false if the property must not be ignored for the given inode + */ +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name) +{ + const struct prop_handler *handler; + + handler = find_prop_handler(name, NULL); + ASSERT(handler != NULL); + + return handler->ignore(inode); } int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, @@ -252,8 +277,12 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) return ret; } -static int prop_compression_validate(const char *value, size_t len) +static int prop_compression_validate(const struct btrfs_inode *inode, + const char *value, size_t len) { + if (!btrfs_inode_can_compress(inode)) + return -EINVAL; + if (!value) return 0; @@ -310,6 +339,22 @@ static int prop_compression_apply(struct inode *inode, const char *value, return 0; } +static bool prop_compression_ignore(const struct btrfs_inode *inode) +{ + /* + * Compression only has effect for regular files, and for directories + * we set it just to propagate it to new files created inside them. + * Everything else (symlinks, devices, sockets, fifos) is pointless as + * it will do nothing, so don't waste metadata space on a compression + * xattr for anything that is neither a file nor a directory. + */ + if (!S_ISREG(inode->vfs_inode.i_mode) && + !S_ISDIR(inode->vfs_inode.i_mode)) + return true; + + return false; +} + static const char *prop_compression_extract(struct inode *inode) { switch (BTRFS_I(inode)->prop_compress) { @@ -330,6 +375,7 @@ static struct prop_handler prop_handlers[] = { .validate = prop_compression_validate, .apply = prop_compression_apply, .extract = prop_compression_extract, + .ignore = prop_compression_ignore, .inheritable = 1 }, }; @@ -356,6 +402,9 @@ static int inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; + if (h->ignore(BTRFS_I(inode))) + continue; + value = h->extract(parent); if (!value) continue; @@ -364,7 +413,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. */ - ret = h->validate(value, strlen(value)); + ret = h->validate(BTRFS_I(inode), value, strlen(value)); if (ret) continue; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 40b2c65b518c..59bea741cfcf 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -13,7 +13,9 @@ void __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, int flags); -int btrfs_validate_prop(const char *name, const char *value, size_t value_len); +int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, + const char *value, size_t value_len); +bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 11089568b287..8cd713d37ad2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + ASSERT(cache->start <= chunk_offset); + /* + * We are using the commit root to search for device extents, so + * that means we could have found a device extent item from a + * block group that was deleted in the current transaction. The + * logical start offset of the deleted block group, stored at + * @chunk_offset, might be part of the logical address range of + * a new block group (which uses different physical extents). + * In this case btrfs_lookup_block_group() has returned the new + * block group, and its start address is less than @chunk_offset. + * + * We skip such new block groups, because it's pointless to + * process them, as we won't find their extents because we search + * for them using the commit root of the extent tree. For a device + * replace it's also fine to skip it, we won't miss copying them + * to the target device because we have the write duplication + * setup through the regular write path (by btrfs_map_block()), + * and we have committed a transaction when we started the device + * replace, right after setting up the device replace state. + */ + if (cache->start < chunk_offset) { + btrfs_put_block_group(cache); + goto skip; + } + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { spin_lock(&cache->lock); if (!cache->to_copy) { @@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, dev_replace->item_needs_writeback = 1; up_write(&dev_replace->rwsem); - ASSERT(cache->start == chunk_offset); ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 17389a42a3ab..ba78ca5aabbb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -922,6 +922,9 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, case BTRFS_EXCLOP_BALANCE: str = "balance\n"; break; + case BTRFS_EXCLOP_BALANCE_PAUSED: + str = "balance paused\n"; + break; case BTRFS_EXCLOP_DEV_ADD: str = "device add\n"; break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 571dae8ad65e..e65633686378 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { mutex_unlock(&fs_info->tree_root->log_mutex); + blk_finish_plug(&plug); goto out; } } @@ -3720,11 +3721,29 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) + /* + * -EEXIST is fine and can happen sporadically when we are logging a + * directory and have concurrent insertions in the subvolume's tree for + * items from other inodes and that result in pushing off some dir items + * from one leaf to another in order to accommodate for the new items. + * This results in logging the same dir index range key. + */ + if (ret && ret != -EEXIST) return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); + if (ret == -EEXIST) { + const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); + + /* + * btrfs_del_dir_entries_in_log() might have been called during + * an unlink between the initial insertion of this key and the + * current update, or we might be logging a single entry deletion + * during a rename, so set the new last_offset to the max value. + */ + last_offset = max(last_offset, curr_end); + } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); @@ -3848,13 +3867,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, ret = insert_dir_log_key(trans, log, dst_path, ino, *last_old_dentry_offset + 1, key.offset - 1); - /* - * -EEXIST should never happen because when we - * log a directory in full mode (LOG_INODE_ALL) - * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from - * the log tree. - */ - ASSERT(ret != -EEXIST); if (ret < 0) return ret; } @@ -5804,6 +5816,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* + * For symlinks, we must always log their content, which is stored in an + * inline extent, otherwise we could end up with an empty symlink after + * log replay, which is invalid on linux (symlink(2) returns -ENOENT if + * one attempts to create an empty symlink). + * We don't need to worry about flushing delalloc, because when we create + * the inline extent when the symlink is created (we never have delalloc + * for symlinks). + */ + if (S_ISLNK(inode->vfs_inode.i_mode)) + inode_only = LOG_INODE_ALL; + + /* * Before logging the inode item, cache the value returned by * inode_logged(), because after that we have the need to figure out if * the inode was previously logged in this transaction. @@ -6181,7 +6205,7 @@ again: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) + if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); @@ -7018,12 +7042,12 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, /* * Other concurrent task might be logging the old directory, * as it can be triggered when logging other inode that had or - * still has a dentry in the old directory. So take the old - * directory's log_mutex to prevent getting an -EEXIST when - * logging a key to record the deletion, or having that other - * task logging the old directory get an -EEXIST if it attempts - * to log the same key after we just did it. In both cases that - * would result in falling back to a transaction commit. + * still has a dentry in the old directory. We lock the old + * directory's log_mutex to ensure the deletion of the old + * name is persisted, because during directory logging we + * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of + * the old name's dir index item is in the delayed items, so + * it could be missed by an in progress directory logging. */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2cfbc74a3b4e..b6b00338037c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -405,7 +405,6 @@ void btrfs_free_device(struct btrfs_device *device) WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); extent_io_tree_release(&device->alloc_state); - bio_put(device->flush_bio); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -643,7 +642,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; device->bdev = bdev; @@ -2706,7 +2705,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!bdev_nonrot(bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -4430,10 +4429,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; + sb_start_write(fs_info->sb); mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); + sb_end_write(fs_info->sb); return ret; } @@ -6947,16 +6948,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (!dev) return ERR_PTR(-ENOMEM); - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); INIT_LIST_HEAD(&dev->post_commit_list); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bd297f23d19e..b11c563d2025 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -121,8 +121,8 @@ struct btrfs_device { /* bytes used on the current transaction */ u64 commit_bytes_used; - /* for sending down flush barriers */ - struct bio *flush_bio; + /* Bio used for flushing device barriers */ + struct bio flush_bio; struct completion flush_wait; /* per-device scrub information */ @@ -328,6 +328,9 @@ struct btrfs_fs_devices { struct btrfs_bio { unsigned int mirror_num; + /* for direct I/O */ + u64 file_offset; + /* @device is for stripe IO submission. */ struct btrfs_device *device; u8 *csum; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 99abf41b89b9..85691dc2232f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -262,7 +262,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); @@ -403,10 +404,13 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); - ret = btrfs_validate_prop(name, value, size); + ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; + if (btrfs_ignore_prop(BTRFS_I(inode), name)) + return 0; + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -416,7 +420,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - BUG_ON(ret); + if (ret) + btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1b1b310c3c51..29b54fd9c128 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -350,7 +350,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); unsigned int max_active_zones; unsigned int nactive; sector_t nr_sectors; @@ -410,7 +409,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = queue_max_active_zones(queue); + max_active_zones = bdev_max_active_zones(bdev); if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err_in_rcu(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", @@ -1835,6 +1834,12 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1842,35 +1847,23 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) if (device->zone_info->max_active_zones == 0) continue; - /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { - ret = false; - goto out_unlock; - } - if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; goto out_unlock; } - - /* Successfully activated all the zones */ - if (i == map->num_stripes - 1) - block_group->zone_is_active = 1; - - } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; spin_unlock(&block_group->lock); - if (block_group->zone_is_active) { - /* For the active block group list */ - btrfs_get_block_group(block_group); + /* For the active block group list */ + btrfs_get_block_group(block_group); - spin_lock(&fs_info->zone_active_bgs_lock); - list_add_tail(&block_group->active_bg_list, - &fs_info->zone_active_bgs); - spin_unlock(&fs_info->zone_active_bgs_lock); - } + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); return true; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index cbf016a7bb5d..6dee76248cb4 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_lock(&inode->vfs_inode, 0); + mutex_lock(&root->fs_info->zoned_data_reloc_io_lock); } static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) @@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) - btrfs_inode_unlock(&inode->vfs_inode, 0); + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); } #endif diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f256c8aff7bb..ca9f3e4ec4b3 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -57,6 +57,16 @@ static void __cachefiles_unmark_inode_in_use(struct cachefiles_object *object, trace_cachefiles_mark_inactive(object, inode); } +static void cachefiles_do_unmark_inode_in_use(struct cachefiles_object *object, + struct dentry *dentry) +{ + struct inode *inode = d_backing_inode(dentry); + + inode_lock(inode); + __cachefiles_unmark_inode_in_use(object, dentry); + inode_unlock(inode); +} + /* * Unmark a backing inode and tell cachefilesd that there's something that can * be culled. @@ -68,9 +78,7 @@ void cachefiles_unmark_inode_in_use(struct cachefiles_object *object, struct inode *inode = file_inode(file); if (inode) { - inode_lock(inode); - __cachefiles_unmark_inode_in_use(object, file->f_path.dentry); - inode_unlock(inode); + cachefiles_do_unmark_inode_in_use(object, file->f_path.dentry); if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { atomic_long_add(inode->i_blocks, &cache->b_released); @@ -484,7 +492,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) object, d_backing_inode(path.dentry), ret, cachefiles_trace_trunc_error); file = ERR_PTR(ret); - goto out_dput; + goto out_unuse; } } @@ -494,15 +502,20 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) trace_cachefiles_vfs_error(object, d_backing_inode(path.dentry), PTR_ERR(file), cachefiles_trace_open_error); - goto out_dput; + goto out_unuse; } if (unlikely(!file->f_op->read_iter) || unlikely(!file->f_op->write_iter)) { fput(file); pr_notice("Cache does not support read_iter and write_iter\n"); file = ERR_PTR(-EINVAL); + goto out_unuse; } + goto out_dput; + +out_unuse: + cachefiles_do_unmark_inode_in_use(object, path.dentry); out_dput: dput(path.dentry); out: @@ -590,14 +603,16 @@ static bool cachefiles_open_file(struct cachefiles_object *object, check_failed: fscache_cookie_lookup_negative(object->cookie); cachefiles_unmark_inode_in_use(object, file); - if (ret == -ESTALE) { - fput(file); - dput(dentry); + fput(file); + dput(dentry); + if (ret == -ESTALE) return cachefiles_create_file(object); - } + return false; + error_fput: fput(file); error: + cachefiles_do_unmark_inode_in_use(object, dentry); dput(dentry); return false; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 35465109d9c4..00b087c14995 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -203,7 +203,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) if (!buf) return false; buf->reserved = cpu_to_be32(0); - memcpy(buf->data, p, len); + memcpy(buf->data, p, volume->vcookie->coherency_len); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa25bffd4823..b6edcf89a429 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_dirty(folio)) { dout("%p dirty_folio %p idx %lu -- already dirty\n", mapping->host, folio, folio->index); - BUG_ON(!folio_get_private(folio)); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(folio_get_private(folio)); + VM_BUG_ON_FOLIO(folio_test_private(folio), folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, } WARN_ON(!folio_test_locked(folio)); - if (folio_get_private(folio)) { + if (folio_test_private(folio)) { dout("%p invalidate_folio idx %lu full dirty page\n", inode, folio->index); @@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f1ad6884d4da..5c14ef04e474 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2274,6 +2274,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -2294,6 +2296,8 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; + if (!s) + continue; if (unlikely(s->s_mds >= max_sessions)) { spin_unlock(&ci->i_unsafe_lock); for (i = 0; i < max_sessions; i++) { @@ -3870,6 +3874,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", inode, ci, mds, mseq, target); retry: + down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) @@ -3933,6 +3938,7 @@ retry: } spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ @@ -3958,6 +3964,7 @@ retry: out_unlock: spin_unlock(&ci->i_ceph_lock); + up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c9e837aa1d3..8c8226c0feac 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative */ return -ENOENT; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fa38c013126d..00c3de177dd6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4434,8 +4434,6 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - struct ceph_fs_client *fsc = s->s_mdsc->fsc; - switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4444,10 +4442,6 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when not force unmounting */ - WARN_ON_ONCE(s->s_ttl && - READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); - fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a47fa44b6d52..2b1a1c029c75 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -266,22 +266,24 @@ static void cifs_kill_sb(struct super_block *sb) * before we kill the sb. */ if (cifs_sb->root) { + for (node = rb_first(root); node; node = rb_next(node)) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + tcon = tlink_tcon(tlink); + if (IS_ERR(tcon)) + continue; + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + + /* finally release root dentry */ dput(cifs_sb->root); cifs_sb->root = NULL; } - node = rb_first(root); - while (node != NULL) { - tlink = rb_entry(node, struct tcon_link, tl_rbnode); - tcon = tlink_tcon(tlink); - cfid = &tcon->crfid; - mutex_lock(&cfid->fid_mutex); - if (cfid->dentry) { - dput(cfid->dentry); - cfid->dentry = NULL; - } - mutex_unlock(&cfid->fid_mutex); - node = rb_next(node); - } kill_anon_super(sb); cifs_umount(cifs_sb); @@ -944,7 +946,7 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return cifs_user_readv(iocb, iter); rc = cifs_revalidate_mapping(inode); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 54155eb4faac..42e14f408856 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -534,12 +534,19 @@ int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { /* If tcp session is not an dfs connection, then reconnect to last target server */ spin_lock(&cifs_tcp_ses_lock); - if (!server->is_dfs_conn || !server->origin_fullpath || !server->leaf_fullpath) { + if (!server->is_dfs_conn) { spin_unlock(&cifs_tcp_ses_lock); return __cifs_reconnect(server, mark_smb_session); } spin_unlock(&cifs_tcp_ses_lock); + mutex_lock(&server->refpath_lock); + if (!server->origin_fullpath || !server->leaf_fullpath) { + mutex_unlock(&server->refpath_lock); + return __cifs_reconnect(server, mark_smb_session); + } + mutex_unlock(&server->refpath_lock); + return reconnect_dfs_server(server); } #else @@ -1049,7 +1056,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_hdr_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", @@ -3675,9 +3682,11 @@ static void setup_server_referral_paths(struct mount_ctx *mnt_ctx) { struct TCP_Server_Info *server = mnt_ctx->server; + mutex_lock(&server->refpath_lock); server->origin_fullpath = mnt_ctx->origin_fullpath; server->leaf_fullpath = mnt_ctx->leaf_fullpath; server->current_fullpath = mnt_ctx->leaf_fullpath; + mutex_unlock(&server->refpath_lock); mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL; } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 30e040da4f09..956f8e5cf3e7 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1422,12 +1422,14 @@ static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool struct TCP_Server_Info *server = tcon->ses->server; mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh); - return 0; } @@ -1530,11 +1532,14 @@ static void refresh_mounts(struct cifs_ses **sessions) list_del_init(&tcon->ulist); mutex_lock(&server->refpath_lock); - if (strcasecmp(server->leaf_fullpath, server->origin_fullpath)) - __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + if (server->origin_fullpath) { + if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath, + server->origin_fullpath)) + __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false); + __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); + } mutex_unlock(&server->refpath_lock); - __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false); cifs_put_tcon(tcon); } } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 852e54ee82c2..bbdf3281559c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -85,6 +85,9 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (rc != 1) return -EINVAL; + if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) + return -EINVAL; + rc = symlink_hash(link_len, link_str, md5_hash); if (rc) { cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index db23f5b404ba..d6aaeff4a30a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -86,6 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ pr_warn_once("server overflowed SMB3 credits\n"); + trace_smb3_overflow_credits(server->CurrentMid, + server->conn_id, server->hostname, *val, + add, server->in_flight); } server->in_flight--; if (server->in_flight == 0 && @@ -251,7 +254,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_wait_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -300,7 +303,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_adj_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", @@ -1855,9 +1858,17 @@ smb2_copychunk_range(const unsigned int xid, int chunks_copied = 0; bool chunk_sizes_updated = false; ssize_t bytes_written, total_bytes_written = 0; + struct inode *inode; pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); + /* + * We need to flush all unwritten data before we can send the + * copychunk ioctl to the server. + */ + inode = d_inode(trgtfile->dentry); + filemap_write_and_wait(inode->i_mapping); + if (pcchunk == NULL) return -ENOMEM; @@ -2492,7 +2503,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_unlock(&server->req_lock); wake_up(&server->request_q); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_pend_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 6cecf302dcfd..bc279616c513 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -1006,6 +1006,13 @@ DEFINE_SMB3_CREDIT_EVENT(credit_timeout); DEFINE_SMB3_CREDIT_EVENT(insufficient_credits); DEFINE_SMB3_CREDIT_EVENT(too_many_credits); DEFINE_SMB3_CREDIT_EVENT(add_credits); +DEFINE_SMB3_CREDIT_EVENT(adj_credits); +DEFINE_SMB3_CREDIT_EVENT(hdr_credits); +DEFINE_SMB3_CREDIT_EVENT(nblk_credits); +DEFINE_SMB3_CREDIT_EVENT(pend_credits); +DEFINE_SMB3_CREDIT_EVENT(wait_credits); +DEFINE_SMB3_CREDIT_EVENT(waitff_credits); +DEFINE_SMB3_CREDIT_EVENT(overflow_credits); DEFINE_SMB3_CREDIT_EVENT(set_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index eeb1a699bd6f..c667e6ddfe2f 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -464,13 +464,12 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS); + tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); if (!tr_hdr) return -ENOMEM; memset(&cur_rqst[0], 0, sizeof(cur_rqst)); memset(&iov, 0, sizeof(iov)); - memset(tr_hdr, 0, sizeof(*tr_hdr)); iov.iov_base = tr_hdr; iov.iov_len = sizeof(*tr_hdr); @@ -542,7 +541,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_nblk_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -1, in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, 1, scredits); @@ -648,7 +647,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, in_flight = server->in_flight; spin_unlock(&server->req_lock); - trace_smb3_add_credits(server->CurrentMid, + trace_smb3_waitff_credits(server->CurrentMid, server->conn_id, server->hostname, scredits, -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 526a4c1bed99..e78be66bbf01 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -113,7 +113,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, if (WARN_ON_ONCE(len <= 0)) return -EINVAL; - if (WARN_ON_ONCE(len % FS_CRYPTO_BLOCK_SIZE != 0)) + if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; fscrypt_generate_iv(&iv, lblk_num, ci); @@ -213,8 +213,8 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * fscrypt_encrypt_block_inplace() - Encrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to encrypt - * @len: Size of block to encrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to encrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to encrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file @@ -283,8 +283,8 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); * fscrypt_decrypt_block_inplace() - Decrypt a filesystem block in-place * @inode: The inode to which this block belongs * @page: The page containing the block to decrypt - * @len: Size of block to decrypt. Doesn't need to be a multiple of the - * fs block size, but must be a multiple of FS_CRYPTO_BLOCK_SIZE. + * @len: Size of block to decrypt. This must be a multiple of + * FSCRYPT_CONTENTS_ALIGNMENT. * @offs: Byte offset within @page at which the block to decrypt begins * @lblk_num: Filesystem logical block number of the block, i.e. the 0-based * number of the block within the file diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index a9be4bc74a94..14e0ef5e9a20 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -19,6 +19,13 @@ #include "fscrypt_private.h" /* + * The minimum message length (input and output length), in bytes, for all + * filenames encryption modes. Filenames shorter than this will be zero-padded + * before being encrypted. + */ +#define FSCRYPT_FNAME_MIN_MSG_LEN 16 + +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -267,7 +274,7 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, if (orig_len > max_len) return false; - encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; @@ -350,7 +357,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, return 0; } - if (iname->len < FS_CRYPTO_BLOCK_SIZE) + if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 5b0a9e6478b5..6b4c8094cc7b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -545,8 +545,8 @@ struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec); +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); @@ -561,7 +561,9 @@ struct fscrypt_mode { int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ - int logged_impl_name; + int logged_cryptoapi_impl; + int logged_blk_crypto_native; + int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; @@ -621,6 +623,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 93c2ca858092..90f3e68f166e 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. */ -#include <linux/blk-crypto.h> +#include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/sched/mm.h> @@ -64,6 +64,35 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) return DIV_ROUND_UP(lblk_bits, 8); } +/* + * Log a message when starting to use blk-crypto (native) or blk-crypto-fallback + * for an encryption mode for the first time. This is the blk-crypto + * counterpart to the message logged when starting to use the crypto API for the + * first time. A limitation is that these messages don't convey which specific + * filesystems or files are using each implementation. However, *usually* + * systems use just one implementation per mode, which makes these messages + * helpful for debugging problems where the "wrong" implementation is used. + */ +static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, + struct request_queue **devs, + int num_devs, + const struct blk_crypto_config *cfg) +{ + int i; + + for (i = 0; i < num_devs; i++) { + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) { + if (!xchg(&mode->logged_blk_crypto_native, 1)) + pr_info("fscrypt: %s using blk-crypto (native)\n", + mode->friendly_name); + } else if (!xchg(&mode->logged_blk_crypto_fallback, 1)) { + pr_info("fscrypt: %s using blk-crypto-fallback\n", + mode->friendly_name); + } + } +} + /* Enable inline encryption for this file if supported. */ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) { @@ -117,6 +146,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) goto out_free_devs; } + fscrypt_log_blk_crypto_impl(ci->ci_mode, devs, num_devs, &crypto_cfg); + ci->ci_inlinecrypt = true; out_free_devs: kfree(devs); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 0b3ffbb4faf4..caee9f8620dd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -688,28 +688,68 @@ out_wipe_secret: } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); -/* - * Add the key for '-o test_dummy_encryption' to the filesystem keyring. - * - * Use a per-boot random key to prevent people from misusing this option. - */ -int fscrypt_add_test_dummy_key(struct super_block *sb, - struct fscrypt_key_specifier *key_spec) +static void +fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(secret, 0, sizeof(*secret)); + secret->size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE); +} + +int fscrypt_get_test_dummy_key_identifier( + u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) +{ struct fscrypt_master_key_secret secret; int err; - get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + fscrypt_get_test_dummy_secret(&secret); - memset(&secret, 0, sizeof(secret)); - secret.size = FSCRYPT_MAX_KEY_SIZE; - memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + if (err) + goto out; + err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER, + NULL, 0, key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); +out: + wipe_master_key_secret(&secret); + return err; +} + +/** + * fscrypt_add_test_dummy_key() - add the test dummy encryption key + * @sb: the filesystem instance to add the key to + * @dummy_policy: the encryption policy for test_dummy_encryption + * + * If needed, add the key for the test_dummy_encryption mount option to the + * filesystem. To prevent misuse of this mount option, a per-boot random key is + * used instead of a hardcoded one. This makes it so that any encrypted files + * created using this option won't be accessible after a reboot. + * + * Return: 0 on success, -errno on failure + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + const struct fscrypt_dummy_policy *dummy_policy) +{ + const union fscrypt_policy *policy = dummy_policy->policy; + struct fscrypt_key_specifier key_spec; + struct fscrypt_master_key_secret secret; + int err; - err = add_master_key(sb, &secret, key_spec); + if (!policy) + return 0; + err = fscrypt_policy_to_key_spec(policy, &key_spec); + if (err) + return err; + fscrypt_get_test_dummy_secret(&secret); + err = add_master_key(sb, &secret, &key_spec); wipe_master_key_secret(&secret); return err; } +EXPORT_SYMBOL_GPL(fscrypt_add_test_dummy_key); /* * Verify that the current user has added a master key with the given identifier diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index eede186b04ce..c35711896bd4 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -94,7 +94,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, mode->cipher_str, PTR_ERR(tfm)); return tfm; } - if (!xchg(&mode->logged_impl_name, 1)) { + if (!xchg(&mode->logged_cryptoapi_impl, 1)) { /* * fscrypt performance can vary greatly depending on which * crypto algorithm implementation is used. Help people debug @@ -425,23 +425,9 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, if (err) return err; - switch (ci->ci_policy.version) { - case FSCRYPT_POLICY_V1: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memcpy(mk_spec.u.descriptor, - ci->ci_policy.v1.master_key_descriptor, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: - mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - memcpy(mk_spec.u.identifier, - ci->ci_policy.v2.master_key_identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); - return -EINVAL; - } + err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); + if (err) + return err; key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec); if (IS_ERR(key)) { diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed3d623724cd..5f858cee1e3b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,6 +10,7 @@ * Modified by Eric Biggers, 2019 for v2 policy support. */ +#include <linux/fs_context.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/string.h> @@ -32,6 +33,26 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, + struct fscrypt_key_specifier *key_spec) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memcpy(key_spec->u.descriptor, policy->v1.master_key_descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + return 0; + case FSCRYPT_POLICY_V2: + key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + memcpy(key_spec->u.identifier, policy->v2.master_key_identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + return 0; + default: + WARN_ON(1); + return -EINVAL; + } +} + static const union fscrypt_policy * fscrypt_get_dummy_policy(struct super_block *sb) { @@ -704,73 +725,45 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) EXPORT_SYMBOL_GPL(fscrypt_set_context); /** - * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' - * @sb: the filesystem on which test_dummy_encryption is being specified - * @arg: the argument to the test_dummy_encryption option. May be NULL. - * @dummy_policy: the filesystem's current dummy policy (input/output, see - * below) - * - * Handle the test_dummy_encryption mount option by creating a dummy encryption - * policy, saving it in @dummy_policy, and adding the corresponding dummy - * encryption key to the filesystem. If the @dummy_policy is already set, then - * instead validate that it matches @arg. Don't support changing it via - * remount, as that is difficult to do safely. + * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option + * @param: the mount option + * @dummy_policy: (input/output) the place to write the dummy policy that will + * result from parsing the option. Zero-initialize this. If a policy is + * already set here (due to test_dummy_encryption being given multiple + * times), then this function will verify that the policies are the same. * - * Return: 0 on success (dummy policy set, or the same policy is already set); - * -EEXIST if a different dummy policy is already set; - * or another -errno value. + * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the + * argument conflicts with one already specified; or -ENOMEM. */ -int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, - struct fscrypt_dummy_policy *dummy_policy) +int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, + struct fscrypt_dummy_policy *dummy_policy) { - struct fscrypt_key_specifier key_spec = { 0 }; - int version; - union fscrypt_policy *policy = NULL; + const char *arg = "v2"; + union fscrypt_policy *policy; int err; - if (!arg) - arg = "v2"; - - if (!strcmp(arg, "v1")) { - version = FSCRYPT_POLICY_V1; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; - memset(key_spec.u.descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - } else if (!strcmp(arg, "v2")) { - version = FSCRYPT_POLICY_V2; - key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; - /* key_spec.u.identifier gets filled in when adding the key */ - } else { - err = -EINVAL; - goto out; - } + if (param->type == fs_value_is_string && *param->string) + arg = param->string; policy = kzalloc(sizeof(*policy), GFP_KERNEL); - if (!policy) { - err = -ENOMEM; - goto out; - } - - err = fscrypt_add_test_dummy_key(sb, &key_spec); - if (err) - goto out; + if (!policy) + return -ENOMEM; - policy->version = version; - switch (policy->version) { - case FSCRYPT_POLICY_V1: + if (!strcmp(arg, "v1")) { + policy->version = FSCRYPT_POLICY_V1; policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor, + memset(policy->v1.master_key_descriptor, 0x42, FSCRYPT_KEY_DESCRIPTOR_SIZE); - break; - case FSCRYPT_POLICY_V2: + } else if (!strcmp(arg, "v2")) { + policy->version = FSCRYPT_POLICY_V2; policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memcpy(policy->v2.master_key_identifier, key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - break; - default: - WARN_ON(1); + err = fscrypt_get_test_dummy_key_identifier( + policy->v2.master_key_identifier); + if (err) + goto out; + } else { err = -EINVAL; goto out; } @@ -789,6 +782,37 @@ out: kfree(policy); return err; } +EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption); + +/** + * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal + * @p1: the first test dummy policy (may be unset) + * @p2: the second test dummy policy (may be unset) + * + * Return: %true if the dummy policies are both set and equal, or both unset. + */ +bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, + const struct fscrypt_dummy_policy *p2) +{ + if (!p1->policy && !p2->policy) + return true; + if (!p1->policy || !p2->policy) + return false; + return fscrypt_policies_equal(p1->policy, p2->policy); +} +EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal); + +/* Deprecated, do not use */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg, + struct fscrypt_dummy_policy *dummy_policy) +{ + struct fs_parameter param = { + .type = fs_value_is_string, + .string = arg ? (char *)arg : "", + }; + return fscrypt_parse_test_dummy_encryption(¶m, dummy_policy) ?: + fscrypt_add_test_dummy_key(sb, dummy_policy); +} EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); /** diff --git a/fs/direct-io.c b/fs/direct-io.c index aef06e607b40..840752006f60 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1115,11 +1115,10 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -static inline ssize_t -do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) { unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1334,29 +1333,6 @@ fail_dio: kmem_cache_free(dio_cache, dio); return retval; } - -ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, - get_block_t get_block, - dio_iodone_t end_io, dio_submit_t submit_io, - int flags) -{ - /* - * The block device state is needed in the end to finally - * submit everything. Since it's likely to be cache cold - * prefetch it here as first thing to hide some of the - * latency. - * - * Attempt to prefetch the pieces we likely need later. - */ - prefetch(&bdev->bd_disk->part_tbl); - prefetch(bdev->bd_disk->queue); - prefetch((char *)bdev->bd_disk->queue + SMP_CACHE_BYTES); - - return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, - end_io, submit_io, flags); -} - EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0ed880f42525..e6dea6dfca16 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1066,12 +1066,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* wake up the caller thread for sync decompression */ if (sync) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); + complete(&io->u.done); + return; } @@ -1217,7 +1214,7 @@ jobqueue_init(struct super_block *sb, } else { fg_out: q = fgq; - init_waitqueue_head(&fgq->u.wait); + init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); } q->sb = sb; @@ -1419,8 +1416,7 @@ static void z_erofs_runqueue(struct super_block *sb, return; /* wait until all bios are completed */ - io_wait_event(io[JQ_SUBMIT].u.wait, - !atomic_read(&io[JQ_SUBMIT].pending_bios)); + wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index e043216b545f..800b11c53f57 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -97,7 +97,7 @@ struct z_erofs_decompressqueue { z_erofs_next_pcluster_t head; union { - wait_queue_head_t wait; + struct completion done; struct work_struct work; } u; }; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 2f5130059236..20d4e47f57ab 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -351,21 +351,20 @@ out: static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) { - struct request_queue *q = bdev_get_queue(inode->i_sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(inode->i_sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(inode->i_sb->s_bdev)); ret = exfat_trim_fs(inode, &range); if (ret < 0) diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8ca21e7917d1..be0788ecaf20 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -627,13 +627,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (opts->allow_utime == (unsigned short)-1) opts->allow_utime = ~opts->fs_dmask & 0022; - if (opts->discard) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); - opts->discard = 0; - } + if (opts->discard && !bdev_max_discard_sectors(sb->s_bdev)) { + exfat_warn(sb, "mounting with \"discard\" option, but the device does not support discard"); + opts->discard = 0; } sb->s_flags |= SB_NODIRATIME; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f87cca49f0c..a743b1e3b89e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2273,6 +2273,10 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) * Structure of a directory entry */ #define EXT4_NAME_LEN 255 +/* + * Base length of the ext4 directory entry excluding the name length + */ +#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ @@ -3032,7 +3036,7 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); -extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); @@ -3064,6 +3068,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); +int ext4_update_overhead(struct super_block *sb); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0d98cf402282..e473fde6b64b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4500,9 +4500,9 @@ retry: return ret > 0 ? ret2 : ret; } -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) @@ -4574,6 +4574,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4690,7 +4694,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(inode, offset, len); + ret = ext4_punch_hole(file, offset, len); goto exit; } @@ -4699,12 +4703,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto exit; if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(inode, offset, len); + ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(inode, offset, len); + ret = ext4_insert_range(file, offset, len); goto exit; } @@ -4740,6 +4744,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out; + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; @@ -5241,8 +5249,9 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; @@ -5294,6 +5303,10 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. @@ -5387,8 +5400,9 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; @@ -5445,6 +5459,10 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Wait for existing dio to complete */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 13740f2d0e61..646ece9b3455 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3953,12 +3953,14 @@ int ext4_break_layouts(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) +int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { + struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset; + loff_t first_block_offset, last_block_offset, max_length; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; @@ -4001,6 +4003,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) offset; } + /* + * For punch hole the length + offset needs to be within one block + * before last range. Adjust the length if it goes beyond that limit. + */ + max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; + if (offset + length > max_length) + length = max_length - offset; + if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* @@ -4016,6 +4026,10 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); + ret = file_modified(file); + if (ret) + goto out_mutex; + /* * Prevent page faults from reinstantiating pages we have released from * page cache. diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 992229ca2d83..4d1d2326eee9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1044,7 +1044,6 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; - struct request_queue *q; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) @@ -1065,10 +1064,8 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) if (flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) return -EINVAL; - q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); - if (!q) - return -ENXIO; - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) @@ -1393,14 +1390,13 @@ resizefs_out: case FITRIM: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* @@ -1652,3 +1648,19 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif + +static void set_overhead(struct ext4_super_block *es, const void *arg) +{ + es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); +} + +int ext4_update_overhead(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_rdonly(sb) || sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)) + return 0; + + return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); +} diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 252c168454c7..ea653d19f9ec 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3498,7 +3498,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; @@ -3629,7 +3629,7 @@ static inline int ext4_issue_discard(struct super_block *sb, return __blkdev_issue_discard(sb->s_bdev, (sector_t)discard_block << (sb->s_blocksize_bits - 9), (sector_t)count << (sb->s_blocksize_bits - 9), - GFP_NOFS, 0, biop); + GFP_NOFS, biop); } else return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } @@ -6455,7 +6455,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; @@ -6475,9 +6475,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ - if (range->minlen < q->limits.discard_granularity) { + if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), - q->limits.discard_granularity >> sb->s_blocksize_bits); + discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e37da8d5cd0c..767b4bfe39c3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1466,10 +1466,10 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; - while ((char *) de < dlimit) { + while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit && + if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 495ce59fb4ad..14695e2b5042 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -134,8 +134,10 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_status) + if (bio->bi_status) { + set_buffer_write_io_error(bh); buffer_io_error(bh); + } } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 81749eaddf4c..6900da973ce2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1199,20 +1199,25 @@ static void ext4_put_super(struct super_block *sb) int aborted = 0; int i, err; - ext4_unregister_li_request(sb); - ext4_quota_off_umount(sb); - - flush_work(&sbi->s_error_work); - destroy_workqueue(sbi->rsv_conversion_wq); - ext4_release_orphan_info(sb); - /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL + * Unregister sysfs before flush sbi->s_error_work. + * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If + * read metadata verify failed then will queue error work. + * flush_stashed_error_work will call start_this_handle may trigger + * BUG_ON. */ ext4_unregister_sysfs(sb); + ext4_unregister_li_request(sb); + ext4_quota_off_umount(sb); + + flush_work(&sbi->s_error_work); + destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); + if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); @@ -4172,9 +4177,11 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; + int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) - return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + return (has_super + ext4_bg_num_gdb(sb, grp) + + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + @@ -5282,9 +5289,18 @@ no_journal: * Get the # of file system overhead blocks from the * superblock if present. */ - if (es->s_overhead_clusters) - sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); - else { + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + /* ignore the precalculated value if it is ridiculous */ + if (sbi->s_overhead > ext4_blocks_count(es)) + sbi->s_overhead = 0; + /* + * If the bigalloc feature is not enabled recalculating the + * overhead doesn't take long, so we might as well just redo + * it to make sure we are using the correct value. + */ + if (!ext4_has_feature_bigalloc(sb)) + sbi->s_overhead = 0; + if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; @@ -5458,13 +5474,9 @@ no_journal: goto failed_mount9; } - if (test_opt(sb, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - ext4_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5602,6 +5614,8 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Quota mode: %s.", descr, ext4_quota_mode(sb)); + /* Update the s_overhead_clusters if necessary */ + ext4_update_overhead(sb); return 0; free_sbi: diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f5366feea82d..909085a78f9c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,9 +98,9 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs && - sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) { - set_ckpt_flags(sbi, CP_ERROR_FLAG); + if (page->index == sbi->metapage_eio_ofs) { + if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); } else { sbi->metapage_eio_ofs = page->index; sbi->metapage_eio_cnt = 0; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e0c2e773c8d..9a1a526f2092 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -388,11 +388,23 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) +static unsigned int f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; - unsigned int fua_flag = io_flag & temp_mask; - unsigned int meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; + unsigned int fua_flag, meta_flag, io_flag; + unsigned int op_flags = 0; + + if (fio->op != REQ_OP_WRITE) + return 0; + if (fio->type == DATA) + io_flag = fio->sbi->data_io_flag; + else if (fio->type == NODE) + io_flag = fio->sbi->node_io_flag; + else + return 0; + + fua_flag = io_flag & temp_mask; + meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: @@ -401,9 +413,10 @@ static void __attach_io_flag(struct f2fs_io_info *fio, unsigned int io_flag) * Cold | Warm | Hot | Cold | Warm | Hot | */ if ((1 << fio->temp) & meta_flag) - fio->op_flags |= REQ_META; + op_flags |= REQ_META; if ((1 << fio->temp) & fua_flag) - fio->op_flags |= REQ_FUA; + op_flags |= REQ_FUA; + return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) @@ -413,14 +426,10 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) sector_t sector; struct bio *bio; - if (fio->type == DATA) - __attach_io_flag(fio, sbi->data_io_flag); - else if (fio->type == NODE) - __attach_io_flag(fio, sbi->node_io_flag); - bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); - bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags, GFP_NOIO, - &f2fs_bioset); + bio = bio_alloc_bioset(bdev, npages, + fio->op | fio->op_flags | f2fs_io_flags(fio), + GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cd1e65bcf0b0..2b2b3c87e45e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -154,7 +154,6 @@ struct f2fs_mount_info { int s_jquota_fmt; /* Format of quota to use */ #endif /* For which write hints are passed down to block layer */ - int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ @@ -1334,12 +1333,6 @@ enum { }; enum { - WHINT_MODE_OFF, /* not pass down write hints */ - WHINT_MODE_USER, /* try to pass down hints given by users */ - WHINT_MODE_FS, /* pass down hints with F2FS policy */ -}; - -enum { ALLOC_MODE_DEFAULT, /* stay default */ ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; @@ -3657,8 +3650,6 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); void f2fs_destroy_segment_manager_caches(void); int f2fs_rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp); unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, @@ -4381,8 +4372,7 @@ static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) static inline bool f2fs_bdev_support_discard(struct block_device *bdev) { - return blk_queue_discard(bdev_get_queue(bdev)) || - bdev_is_zoned(bdev); + return bdev_max_discard_sectors(bdev) || bdev_is_zoned(bdev); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b89af0f27f0..35b6c720c2bc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2285,7 +2285,6 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret; @@ -2304,7 +2303,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return ret; range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = f2fs_trim_fs(F2FS_SB(sb), &range); mnt_drop_write_file(filp); if (ret < 0) @@ -3686,18 +3685,18 @@ out: static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, pgoff_t off, block_t block, block_t len, u32 flags) { - struct request_queue *q = bdev_get_queue(bdev); sector_t sector = SECTOR_FROM_BLOCK(block); sector_t nr_sects = SECTOR_FROM_BLOCK(len); int ret = 0; - if (!q) - return -ENXIO; - - if (flags & F2FS_TRIM_FILE_DISCARD) - ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, - blk_queue_secure_erase(q) ? - BLKDEV_DISCARD_SECURE : 0); + if (flags & F2FS_TRIM_FILE_DISCARD) { + if (bdev_max_secure_erase_sectors(bdev)) + ret = blkdev_issue_secure_erase(bdev, sector, nr_sects, + GFP_NOFS); + else + ret = blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS); + } if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 71f232dcf3c2..83639238a1fe 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -550,7 +550,8 @@ make_now: } f2fs_set_inode_flags(inode); - if (file_should_truncate(inode)) { + if (file_should_truncate(inode) && + !is_sbi_flag_set(sbi, SBI_POR_DOING)) { ret = f2fs_truncate(inode); if (ret) goto bad_inode; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 22dfeb991529..7225ce09f3ab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1196,9 +1196,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, unsigned int *issued) { struct block_device *bdev = dc->bdev; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); @@ -1245,7 +1244,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); submit: if (err) { spin_lock_irqsave(&dc->lock, flags); @@ -1375,9 +1374,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct request_queue *q = bdev_get_queue(bdev); unsigned int max_discard_blocks = - SECTOR_TO_BLOCK(q->limits.max_discard_sectors); + SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, @@ -3243,101 +3241,6 @@ int f2fs_rw_hint_to_seg_type(enum rw_hint hint) } } -/* This returns write hints for each segment type. This hints will be - * passed down to block layer. There are mapping tables which depend on - * the mount option 'whint_mode'. - * - * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. - * - * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_NOT_SET - * HOT_NODE " - * WARM_NODE " - * COLD_NODE " - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - * - * 3) whint_mode=fs-based. F2FS passes down hints with its policy. - * - * User F2FS Block - * ---- ---- ----- - * META WRITE_LIFE_MEDIUM; - * HOT_NODE WRITE_LIFE_NOT_SET - * WARM_NODE " - * COLD_NODE WRITE_LIFE_NONE - * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME - * extension list " " - * - * -- buffered io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG - * WRITE_LIFE_NONE " " - * WRITE_LIFE_MEDIUM " " - * WRITE_LIFE_LONG " " - * - * -- direct io - * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME - * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT - * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET - * WRITE_LIFE_NONE " WRITE_LIFE_NONE - * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM - * WRITE_LIFE_LONG " WRITE_LIFE_LONG - */ - -enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, - enum page_type type, enum temp_type temp) -{ - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_NOT_SET; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else { - return WRITE_LIFE_NOT_SET; - } - } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { - if (type == DATA) { - if (temp == WARM) - return WRITE_LIFE_LONG; - else if (temp == HOT) - return WRITE_LIFE_SHORT; - else if (temp == COLD) - return WRITE_LIFE_EXTREME; - } else if (type == NODE) { - if (temp == WARM || temp == HOT) - return WRITE_LIFE_NOT_SET; - else if (temp == COLD) - return WRITE_LIFE_NONE; - } else if (type == META) { - return WRITE_LIFE_MEDIUM; - } - } - return WRITE_LIFE_NOT_SET; -} - static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea939db18f88..4368f90571bd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -138,7 +138,6 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, - Opt_whint, Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, @@ -214,7 +213,6 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, @@ -975,22 +973,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "quota operations not supported"); break; #endif - case Opt_whint: - name = match_strdup(&args[0]); - if (!name) - return -ENOMEM; - if (!strcmp(name, "user-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; - } else if (!strcmp(name, "off")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - } else if (!strcmp(name, "fs-based")) { - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; - } else { - kfree(name); - return -EINVAL; - } - kfree(name); - break; case Opt_alloc: name = match_strdup(&args[0]); if (!name) @@ -1328,12 +1310,6 @@ default_check: return -EINVAL; } - /* Not pass down write hints if the number of active logs is lesser - * than NR_CURSEG_PERSIST_TYPE. - */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; - if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1978,10 +1954,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) - seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) - seq_printf(seq, ",whint_mode=%s", "fs-based"); fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); @@ -2033,7 +2005,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); @@ -2314,8 +2285,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & SB_RDONLY || - F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { + if (*flags & SB_RDONLY) { sync_inodes_sb(sb); set_sbi_flag(sbi, SBI_IS_DIRTY); diff --git a/fs/fat/file.c b/fs/fat/file.c index a5a309fcc7fa..bf91f977debe 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -127,13 +127,12 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) struct super_block *sb = inode->i_sb; struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; @@ -141,7 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bf6051bdf1d1..3d1afb95a925 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1872,13 +1872,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, goto out_fail; } - if (sbi->options.discard) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - fat_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev)) + fat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); fat_set_state(sb, 1, 0); return 0; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 591fe9cf1659..a1074a26e784 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1712,6 +1712,10 @@ static int writeback_single_inode(struct inode *inode, */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); + else if (!(inode->i_state & I_SYNC_QUEUED) && + (inode->i_state & I_DIRTY)) + redirty_tail_locked(inode, wb); + spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1775,11 +1779,12 @@ static long writeback_sb_inodes(struct super_block *sb, }; unsigned long start_time = jiffies; long write_chunk; - long wrote = 0; /* count both pages and inodes */ + long total_wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; + long wrote; if (inode->i_sb != sb) { if (work->sb) { @@ -1855,7 +1860,9 @@ static long writeback_sb_inodes(struct super_block *sb, wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; - wrote += write_chunk - wbc.nr_to_write; + wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; + wrote = wrote < 0 ? 0 : wrote; + total_wrote += wrote; if (need_resched()) { /* @@ -1877,7 +1884,7 @@ static long writeback_sb_inodes(struct super_block *sb, tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) - wrote++; + total_wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); @@ -1891,14 +1898,14 @@ static long writeback_sb_inodes(struct super_block *sb, * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ - if (wrote) { + if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } - return wrote; + return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 76316c4a3fb7..b313a978ae0a 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -38,6 +38,3 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_OLD_API - bool diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index 2749933852a9..d645f8b302a2 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -214,7 +214,7 @@ void fscache_relinquish_cache(struct fscache_cache *cache) cache->ops = NULL; cache->cache_priv = NULL; - smp_store_release(&cache->state, FSCACHE_CACHE_IS_NOT_PRESENT); + fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT); fscache_put_cache(cache, where); } EXPORT_SYMBOL(fscache_relinquish_cache); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 9bb1ab5fe5ed..9d3cf0111709 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -30,7 +30,7 @@ static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; -unsigned int fscache_lru_cookie_timeout = 10 * HZ; +static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { @@ -1069,6 +1069,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, } EXPORT_SYMBOL(__fscache_invalidate); +#ifdef CONFIG_PROC_FS /* * Generate a list of extant cookies in /proc/fs/fscache/cookies */ @@ -1145,3 +1146,4 @@ const struct seq_operations fscache_cookies_seq_ops = { .stop = fscache_cookies_seq_stop, .show = fscache_cookies_seq_show, }; +#endif diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index ed1c9ed737f2..1336f517e9b1 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -56,7 +56,9 @@ static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache, * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_cookies_seq_ops; +#endif extern struct timer_list fscache_cookie_lru_timer; extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix); @@ -137,7 +139,9 @@ int fscache_stats_show(struct seq_file *m, void *v); /* * volume.c */ +#ifdef CONFIG_PROC_FS extern const struct seq_operations fscache_volumes_seq_ops; +#endif struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, enum fscache_volume_trace where); diff --git a/fs/fscache/io.c b/fs/fscache/io.c index c8c7fe9e9a6e..3af3b08a9bb3 100644 --- a/fs/fscache/io.c +++ b/fs/fscache/io.c @@ -235,8 +235,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, { struct fscache_write_request *wreq = priv; - fscache_clear_page_bits(fscache_cres_cookie(&wreq->cache_resources), - wreq->mapping, wreq->start, wreq->len, + fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len, wreq->set_bits); if (wreq->term_func) @@ -296,7 +295,7 @@ abandon_end: abandon_free: kfree(wreq); abandon: - fscache_clear_page_bits(cookie, mapping, start, len, cond); + fscache_clear_page_bits(mapping, start, len, cond); if (term_func) term_func(term_func_priv, ret, false); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 39080b2d6cf8..b6697333bb2b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ - loff_t blockmask = i_blocksize(inode) - 1; - loff_t end = (pos + length) & ~blockmask; + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; - pos = (pos + written + blockmask) & ~blockmask; - if (pos < end) { - truncate_pagecache_range(inode, pos, end - 1); - punch_hole(ip, pos, end - pos); + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); } } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 22b41acfbbc3..2556ae1f92ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -770,30 +770,27 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } -static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, +static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, size_t *prev_count, size_t *window_size) { size_t count = iov_iter_count(i); size_t size, offs; - if (likely(!count)) - return false; - if (ret <= 0 && ret != -EFAULT) + if (!count) return false; if (!iter_is_iovec(i)) return false; size = PAGE_SIZE; - offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); + offs = offset_in_page(iocb->ki_pos); if (*prev_count != count || !*window_size) { size_t nr_dirtied; - size = ALIGN(offs + count, PAGE_SIZE); - size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); - size = min(size, nr_dirtied << PAGE_SHIFT); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; @@ -807,7 +804,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -835,35 +832,31 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, written); + IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) - written = ret; - - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; + read = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return written; + return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -873,7 +866,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -899,41 +892,37 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) - goto out; + goto out_unlock; from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, read); + IOMAP_DIO_PARTIAL, written); from->nofault = false; - - if (ret == -ENOTBLK) - ret = 0; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } if (ret > 0) - read = ret; - - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; + written = ret; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_readable(from, window_size); + if (window_size) goto retry; - } } -out: +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return read; + return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -941,7 +930,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -962,7 +951,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ret >= 0) { if (!iov_iter_count(to)) return ret; - written = ret; + read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; @@ -975,32 +964,26 @@ retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) - written += ret; + read += ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - - gfs2_holder_allow_demote(&gh); - leftover = fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(&gh); - if (leftover != window_size) { - if (gfs2_holder_queued(&gh)) - goto retry_under_glock; - if (written) - goto out_uninit; + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(&gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) goto retry; - } } +out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - return written ? written : ret; + return read ? read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, @@ -1014,7 +997,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -1032,10 +1015,18 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); retry: + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + window_size -= fault_in_iov_iter_readable(from, window_size); + if (!window_size) { + ret = -EFAULT; + goto out_uninit; + } + from->count = min(from->count, window_size); + } ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: + if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1052,27 +1043,19 @@ retry_under_glock: current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; - read += ret; + written += ret; } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); - from->count = orig_count - read; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - from->count = min(from->count, window_size - leftover); - if (gfs2_holder_queued(gh)) - goto retry_under_glock; - if (read && !(iocb->ki_flags & IOCB_DIRECT)) - goto out_uninit; - goto retry; - } + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + + from->count = orig_count - written; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; } out_unlock: if (gfs2_holder_queued(gh)) @@ -1081,8 +1064,8 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - from->count = orig_count - read; - return read ? read : ret; + from->count = orig_count - written; + return written ? written : ret; } /** diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 801ad9f4f2be..6d26bb525484 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1386,7 +1386,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) { struct inode *inode = file_inode(filp); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct block_device *bdev = sdp->sd_vfs->s_bdev; struct buffer_head *bh; struct gfs2_rgrpd *rgd; struct gfs2_rgrpd *rgd_end; @@ -1405,7 +1405,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(&r, argp, sizeof(r))) @@ -1418,8 +1418,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); - minlen = max_t(u64, minlen, - q->limits.discard_granularity) >> bs_shift; + minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) return -EINVAL; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 99c7477cee5c..dd3a088db11d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/internal.h b/fs/internal.h index 08503dc68d2b..9a6c233ee7f1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -191,3 +191,32 @@ long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); + +/* + * fs/xattr.c: + */ +struct xattr_name { + char name[XATTR_NAME_MAX + 1]; +}; + +struct xattr_ctx { + /* Value of attribute */ + union { + const void __user *cvalue; + void __user *value; + }; + void *kvalue; + size_t size; + /* Attribute name */ + struct xattr_name *kname; + unsigned int flags; +}; + + +ssize_t do_getxattr(struct user_namespace *mnt_userns, + struct dentry *d, + struct xattr_ctx *ctx); + +int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx); diff --git a/fs/io-wq.c b/fs/io-wq.c index 32aeb2c581c5..824623bcf1a5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, static bool io_wq_worker_wake(struct io_worker *worker, void *data) { - set_notify_signal(worker->task); + __set_notify_signal(worker->task); wake_up_process(worker->task); return false; } @@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, { if (work && match->fn(work, match->data)) { work->flags |= IO_WQ_WORK_CANCEL; - set_notify_signal(worker->task); + __set_notify_signal(worker->task); return true; } diff --git a/fs/io-wq.h b/fs/io-wq.h index 04d374e65e54..ba6eee76d028 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,7 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; - int fd; + int cancel_seq; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 659f8ecba5b7..9f1c682d7caf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -80,6 +80,7 @@ #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/xattr.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -94,7 +95,7 @@ #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 15) +#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -113,6 +114,11 @@ #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ + IO_REQ_CLEAN_FLAGS) + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { @@ -166,7 +172,7 @@ struct io_rings { * The application needs a full memory barrier before checking * for IORING_SQ_NEED_WAKEUP after updating the sq tail. */ - u32 sq_flags; + atomic_t sq_flags; /* * Runtime CQ flags * @@ -198,13 +204,6 @@ struct io_rings { struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; }; -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -216,10 +215,27 @@ struct io_mapped_ubuf { struct io_ring_ctx; struct io_overflow_cqe { - struct io_uring_cqe cqe; struct list_head list; + struct io_uring_cqe cqe; }; +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + struct io_fixed_file { /* file * with additional FFS_* flags */ unsigned long file_ptr; @@ -237,6 +253,8 @@ struct io_rsrc_put { struct io_file_table { struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; }; struct io_rsrc_node { @@ -261,10 +279,26 @@ struct io_rsrc_data { bool quiesce; }; +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) struct io_buffer_list { - struct list_head list; - struct list_head buf_list; + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. + */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u32 head; + __u32 mask; }; struct io_buffer { @@ -337,7 +371,7 @@ struct io_ev_fd { struct rcu_head rcu; }; -#define IO_BUFFERS_HASH_BITS 5 +#define BGID_ARRAY 64 struct io_ring_ctx { /* const or read-mostly hot data */ @@ -346,6 +380,7 @@ struct io_ring_ctx { struct io_rings *rings; unsigned int flags; + enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -353,6 +388,7 @@ struct io_ring_ctx { unsigned int drain_active: 1; unsigned int drain_disabled: 1; unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; } ____cacheline_aligned_in_smp; /* submission data */ @@ -382,17 +418,21 @@ struct io_ring_ctx { */ struct io_rsrc_node *rsrc_node; int rsrc_cached_refs; + atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + struct list_head timeout_list; struct list_head ltimeout_list; struct list_head cq_overflow_list; - struct list_head *io_buffers; - struct list_head io_buffers_cache; struct list_head apoll_cache; struct xarray personalities; u32 pers_next; @@ -409,9 +449,16 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - unsigned long check_cq_overflow; + unsigned long check_cq; struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; @@ -497,7 +544,7 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - struct io_wq_work_list prior_task_list; + struct io_wq_work_list prio_task_list; struct callback_head task_work; struct file **registered_rings; bool task_running; @@ -546,6 +593,16 @@ struct io_accept { unsigned long nofile; }; +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + struct io_sync { struct file *file; loff_t len; @@ -557,6 +614,8 @@ struct io_sync { struct io_cancel { struct file *file; u64 addr; + u32 flags; + s32 fd; }; struct io_timeout { @@ -585,7 +644,7 @@ struct io_rw { struct kiocb kiocb; u64 addr; u32 len; - u32 flags; + rwf_t flags; }; struct io_connect { @@ -602,9 +661,9 @@ struct io_sr_msg { void __user *buf; }; int msg_flags; - int bgid; size_t len; size_t done_io; + unsigned int flags; }; struct io_open { @@ -722,6 +781,12 @@ struct io_msg { u32 len; }; +struct io_nop { + struct file *file; + u64 extra1; + u64 extra2; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -748,6 +813,12 @@ struct io_async_rw { struct wait_page_queue wpq; }; +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -766,6 +837,7 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, @@ -776,6 +848,7 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, + REQ_F_APOLL_MULTISHOT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -816,6 +889,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -840,6 +915,8 @@ enum { REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), /* request has already done partial IO */ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), }; struct async_poll { @@ -862,6 +939,21 @@ enum { IORING_RSRC_BUFFER = 1, }; +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +enum { + IO_CHECK_CQ_OVERFLOW_BIT, + IO_CHECK_CQ_DROPPED_BIT, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -897,38 +989,65 @@ struct io_kiocb { struct io_symlink symlink; struct io_hardlink hardlink; struct io_msg msg; + struct io_xattr xattr; + struct io_socket sock; + struct io_nop nop; + struct io_uring_cmd uring_cmd; }; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ u16 buf_index; unsigned int flags; - u64 user_data; - u32 result; - u32 cflags; + struct io_cqe cqe; struct io_ring_ctx *ctx; struct task_struct *task; - struct percpu_ref *fixed_rsrc_refs; - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; + + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + __poll_t apoll_events; + }; atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ @@ -948,6 +1067,24 @@ struct io_defer_entry { u32 seq; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) + struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -969,12 +1106,20 @@ struct io_op_def { unsigned not_supported : 1; /* skip auditing */ unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; }; static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = {}, + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .buffer_select = 1, + }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -983,6 +1128,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -993,6 +1140,8 @@ static const struct io_op_def io_op_defs[] = { .needs_async_setup = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { @@ -1005,6 +1154,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -1014,6 +1165,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { @@ -1056,6 +1209,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1078,6 +1232,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_CLOSE] = {}, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, @@ -1089,6 +1244,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1098,6 +1255,8 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { @@ -1132,9 +1291,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, + .iopoll = 1, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -1152,11 +1313,30 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_LINKAT] = {}, [IORING_OP_MSG_RING] = { .needs_file = 1, + .iopoll = 1, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_SETXATTR] = {}, + [IORING_OP_FGETXATTR] = { + .needs_file = 1 + }, + [IORING_OP_GETXATTR] = {}, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), }, }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); @@ -1165,10 +1345,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); - -static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req); +static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1177,10 +1354,10 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd); +static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_drop_inflight_file(struct io_kiocb *req); static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); -static void __io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); @@ -1193,11 +1370,115 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; +const char *io_uring_get_opcode(u8 opcode) +{ + switch ((enum io_uring_op)opcode) { + case IORING_OP_NOP: + return "NOP"; + case IORING_OP_READV: + return "READV"; + case IORING_OP_WRITEV: + return "WRITEV"; + case IORING_OP_FSYNC: + return "FSYNC"; + case IORING_OP_READ_FIXED: + return "READ_FIXED"; + case IORING_OP_WRITE_FIXED: + return "WRITE_FIXED"; + case IORING_OP_POLL_ADD: + return "POLL_ADD"; + case IORING_OP_POLL_REMOVE: + return "POLL_REMOVE"; + case IORING_OP_SYNC_FILE_RANGE: + return "SYNC_FILE_RANGE"; + case IORING_OP_SENDMSG: + return "SENDMSG"; + case IORING_OP_RECVMSG: + return "RECVMSG"; + case IORING_OP_TIMEOUT: + return "TIMEOUT"; + case IORING_OP_TIMEOUT_REMOVE: + return "TIMEOUT_REMOVE"; + case IORING_OP_ACCEPT: + return "ACCEPT"; + case IORING_OP_ASYNC_CANCEL: + return "ASYNC_CANCEL"; + case IORING_OP_LINK_TIMEOUT: + return "LINK_TIMEOUT"; + case IORING_OP_CONNECT: + return "CONNECT"; + case IORING_OP_FALLOCATE: + return "FALLOCATE"; + case IORING_OP_OPENAT: + return "OPENAT"; + case IORING_OP_CLOSE: + return "CLOSE"; + case IORING_OP_FILES_UPDATE: + return "FILES_UPDATE"; + case IORING_OP_STATX: + return "STATX"; + case IORING_OP_READ: + return "READ"; + case IORING_OP_WRITE: + return "WRITE"; + case IORING_OP_FADVISE: + return "FADVISE"; + case IORING_OP_MADVISE: + return "MADVISE"; + case IORING_OP_SEND: + return "SEND"; + case IORING_OP_RECV: + return "RECV"; + case IORING_OP_OPENAT2: + return "OPENAT2"; + case IORING_OP_EPOLL_CTL: + return "EPOLL_CTL"; + case IORING_OP_SPLICE: + return "SPLICE"; + case IORING_OP_PROVIDE_BUFFERS: + return "PROVIDE_BUFFERS"; + case IORING_OP_REMOVE_BUFFERS: + return "REMOVE_BUFFERS"; + case IORING_OP_TEE: + return "TEE"; + case IORING_OP_SHUTDOWN: + return "SHUTDOWN"; + case IORING_OP_RENAMEAT: + return "RENAMEAT"; + case IORING_OP_UNLINKAT: + return "UNLINKAT"; + case IORING_OP_MKDIRAT: + return "MKDIRAT"; + case IORING_OP_SYMLINKAT: + return "SYMLINKAT"; + case IORING_OP_LINKAT: + return "LINKAT"; + case IORING_OP_MSG_RING: + return "MSG_RING"; + case IORING_OP_FSETXATTR: + return "FSETXATTR"; + case IORING_OP_SETXATTR: + return "SETXATTR"; + case IORING_OP_FGETXATTR: + return "FGETXATTR"; + case IORING_OP_GETXATTR: + return "GETXATTR"; + case IORING_OP_SOCKET: + return "SOCKET"; + case IORING_OP_URING_CMD: + return "URING_CMD"; + case IORING_OP_LAST: + return "INVALID"; + } + return "INVALID"; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1211,6 +1492,42 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} +#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -1272,31 +1589,36 @@ static inline void io_req_set_refcount(struct io_kiocb *req) #define IO_RSRC_REF_BATCH 100 +static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct percpu_ref *ref = req->fixed_rsrc_refs; + struct io_rsrc_node *node = req->rsrc_node; - if (ref) { - if (ref == &ctx->rsrc_node->refs) + if (node) { + if (node == ctx->rsrc_node) ctx->rsrc_cached_refs++; else - percpu_ref_put(ref); + io_rsrc_put_node(node, 1); } } -static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +static inline void io_req_put_rsrc(struct io_kiocb *req) { - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); } static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { if (ctx->rsrc_cached_refs) { - percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); ctx->rsrc_cached_refs = 0; } } @@ -1312,8 +1634,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { - if (!req->fixed_rsrc_refs) { - req->fixed_rsrc_refs = &ctx->rsrc_node->refs; + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); @@ -1321,28 +1643,30 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (unlikely(ctx->rsrc_cached_refs < 0)) io_rsrc_refs_refill(ctx); } else { - percpu_ref_get(req->fixed_rsrc_refs); + percpu_ref_get(&req->rsrc_node->refs); } } } static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) { - struct io_buffer *kbuf = req->kbuf; - unsigned int cflags; + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } - cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT); - req->flags &= ~REQ_F_BUFFER_SELECTED; - list_add(&kbuf->list, list); - req->kbuf = NULL; - return cflags; + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { lockdep_assert_held(&req->ctx->completion_lock); - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } @@ -1352,7 +1676,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, { unsigned int cflags; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; /* @@ -1367,7 +1691,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, * We migrate buffers from the comp_list to the issue cache list * when we need one. */ - if (issue_flags & IO_URING_F_UNLOCKED) { + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); @@ -1385,15 +1712,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { - struct list_head *hash_list; - struct io_buffer_list *bl; + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; - hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - list_for_each_entry(bl, hash_list, list) - if (bl->bgid == bgid || bgid == -1U) - return bl; - - return NULL; + return xa_load(&ctx->io_bl_xa, bgid); } static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) @@ -1402,25 +1724,33 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) struct io_buffer_list *bl; struct io_buffer *buf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return; /* don't recycle if we already did IO to this buffer */ if (req->flags & REQ_F_PARTIAL_IO) return; + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + return; + } - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; - req->kbuf = NULL; + req->buf_index = buf->bgid; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -1461,7 +1791,12 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->result = res; + req->cqe.res = res; +} + +static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1498,12 +1833,14 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - int i, hash_bits; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; + xa_init(&ctx->io_bl_xa); + /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. @@ -1525,13 +1862,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* set invalid range, so io_import_fixed() fails meeting it */ ctx->dummy_ubuf->ubuf = -1UL; - ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS, - sizeof(struct list_head), GFP_KERNEL); - if (!ctx->io_buffers) - goto err; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) - INIT_LIST_HEAD(&ctx->io_buffers[i]); - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1567,7 +1897,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } @@ -1591,10 +1922,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - static inline bool io_req_ffs_set(struct io_kiocb *req) { return req->flags & REQ_F_FIXED_FILE; @@ -1621,6 +1948,17 @@ static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return __io_prep_linked_timeout(req); } +static noinline void __io_arm_ltimeout(struct io_kiocb *req) +{ + io_queue_linked_timeout(__io_prep_linked_timeout(req)); +} + +static inline void io_arm_ltimeout(struct io_kiocb *req) +{ + if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) + __io_arm_ltimeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1633,6 +1971,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.list.next = NULL; req->work.flags = 0; + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1664,17 +2003,15 @@ static void io_prep_async_link(struct io_kiocb *req) static inline void io_req_add_compl_list(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; + struct io_submit_state *state = &req->ctx->submit_state; if (!(req->flags & REQ_F_CQE_SKIP)) - ctx->submit_state.flush_cqes = true; + state->flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) +static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1694,8 +2031,9 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags, - &req->work, io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, + req->opcode, req->flags, &req->work, + io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1713,8 +2051,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_fill_cqe_req(req, status, 0); - io_put_req_deferred(req); + io_req_tw_post_queue(req, status, 0); } } @@ -1796,21 +2133,53 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail, mask = ctx->cq_entries - 1; - - /* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ - if (__io_cqring_events(ctx) == ctx->cq_entries) + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) return NULL; - tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & mask]; + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); } static void io_eventfd_signal(struct io_ring_ctx *ctx) @@ -1881,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed, posted; + size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; + if (ctx->flags & IORING_SETUP_CQE32) + cqe_size <<= 1; + posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { @@ -1896,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); if (cqe) - memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); + memcpy(cqe, &ocqe->cqe, cqe_size); else io_account_cq_overflow(ctx); @@ -1907,13 +2280,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -1924,7 +2295,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; - if (test_bit(0, &ctx->check_cq_overflow)) { + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -1936,19 +2307,23 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - if (likely(task == current)) { - tctx->cached_refs += nr; - } else { - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); - } + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); } static void io_task_refs_refill(struct io_uring_task *tctx) @@ -1982,11 +2357,18 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) + s32 res, u32 cflags, u64 extra1, + u64 extra2) { struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); - ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); + ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, @@ -1994,17 +2376,21 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * on the floor. */ io_account_cq_overflow(ctx); + set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->check_cq_overflow); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; + if (is_cqe32) { + ocqe->cqe.big_cqe[0] = extra1; + ocqe->cqe.big_cqe[1] = extra2; + } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } @@ -2026,42 +2412,114 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, WRITE_ONCE(cqe->flags, cflags); return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags); + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +} + +static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); +} + +static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + u64 extra1 = req->extra1; + u64 extra2 = req->extra2; + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + cqe->big_cqe[0] = extra1; + cqe->big_cqe[1] = extra2; + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res, + req->cqe.flags, extra1, extra2); } static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags); - return __io_fill_cqe(req->ctx, req->user_data, res, cflags); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0); + return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags); } -static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) +static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags, + u64 extra1, u64 extra2) { - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return; + if (req->flags & REQ_F_CQE_SKIP) + return; + + trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags, + extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + WRITE_ONCE(cqe->user_data, req->cqe.user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, cflags); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return; + } + + io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags); + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); return __io_fill_cqe(ctx, user_data, res, cflags); } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_put(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { @@ -2069,7 +2527,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, req->link = NULL; } } - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid @@ -2083,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } -static void io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(req, res, cflags); + __io_req_complete_put(req); +} + +static void __io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe32_req(req, res, cflags, extra1, extra2); + __io_req_complete_put(req); +} + +static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -2095,11 +2568,23 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, io_cqring_ev_posted(ctx); } +static void io_req_complete_post32(struct io_kiocb *req, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + __io_req_complete_post32(req, res, cflags, extra1, extra2); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + static inline void io_req_complete_state(struct io_kiocb *req, s32 res, u32 cflags) { - req->result = res; - req->cflags = cflags; + req->cqe.res = res; + req->cqe.flags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } @@ -2112,8 +2597,23 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } +static inline void __io_req_complete32(struct io_kiocb *req, + unsigned int issue_flags, s32 res, + u32 cflags, u64 extra1, u64 extra2) +{ + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_complete_state(req, res, cflags); + req->extra1 = extra1; + req->extra2 = extra2; + } else { + io_req_complete_post32(req, res, cflags, extra1, extra2); + } +} + static inline void io_req_complete(struct io_kiocb *req, s32 res) { + if (res < 0) + req_set_fail(req); __io_req_complete(req, 0, res, 0); } @@ -2123,17 +2623,6 @@ static void io_req_complete_failed(struct io_kiocb *req, s32 res) io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } -static void io_req_complete_fail_submit(struct io_kiocb *req) -{ - /* - * We don't submit, fail them all, for that replace hardlinks with - * normal links. Extra REQ_F_LINK is tolerated. - */ - req->flags &= ~REQ_F_HARDLINK; - req->flags |= REQ_F_LINK; - io_req_complete_failed(req, req->result); -} - /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. @@ -2144,7 +2633,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->result = 0; + req->cqe.res = 0; } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -2156,19 +2645,9 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } -/* Returns true IFF there are requests in the cache */ -static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) +static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) { - struct io_submit_state *state = &ctx->submit_state; - - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, state); - return !!state->free_list.next; + return !ctx->submit_state.free_list.next; } /* @@ -2180,14 +2659,20 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - struct io_kiocb *req; int ret, i; - if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) - return true; + /* + * If we have more than a batch's worth of requests in our IRQ side + * locked cache, grab the lock and move them over to our submission + * side cache. + */ + if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { + io_flush_cached_locked_reqs(ctx, &ctx->submit_state); + if (!io_req_cache_empty(ctx)) + return true; + } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); @@ -2204,17 +2689,17 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { - req = reqs[i]; + struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); - wq_stack_add_head(&req->comp_list, &state->free_list); + io_req_add_to_cache(req, ctx); } return true; } static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) { - if (unlikely(!ctx->submit_state.free_list.next)) + if (unlikely(io_req_cache_empty(ctx))) return __io_alloc_req_refill(ctx); return true; } @@ -2243,11 +2728,11 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void __io_free_req(struct io_kiocb *req) +static __cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - io_req_put_rsrc(req, ctx); + io_req_put_rsrc(req); io_dismantle_req(req); io_put_task(req->task, 1); @@ -2265,7 +2750,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static bool io_kill_linked_timeout(struct io_kiocb *req) +static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -2278,13 +2763,10 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); - return true; + return link; } } - return false; + return NULL; } static void io_fail_links(struct io_kiocb *req) @@ -2298,19 +2780,19 @@ static void io_fail_links(struct io_kiocb *req) long res = -ECANCELED; if (link->flags & REQ_F_FAIL) - res = link->result; + res = link->cqe.res; nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->user_data, + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, req->opcode, link); - if (!ignore_cqes) { + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else link->flags &= ~REQ_F_CQE_SKIP; - io_fill_cqe_req(link, res, 0); - } - io_put_req_deferred(link); + __io_req_complete_post(link, res, 0); link = nxt; } } @@ -2318,25 +2800,27 @@ static void io_fail_links(struct io_kiocb *req) static bool io_disarm_next(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + struct io_kiocb *link = NULL; bool posted = false; if (req->flags & REQ_F_ARM_LTIMEOUT) { - struct io_kiocb *link = req->link; - + link = req->link; req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); - /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ - io_fill_cqe_req(link, -ECANCELED, 0); - io_put_req_deferred(link); + io_req_tw_post_queue(link, -ECANCELED, 0); posted = true; } } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); - posted = io_kill_linked_timeout(req); + link = io_disarm_linked_timeout(req); spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { @@ -2353,8 +2837,7 @@ static void __io_req_find_next_prep(struct io_kiocb *req) spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); - if (posted) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -2364,8 +2847,6 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other @@ -2383,6 +2864,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (*locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2426,7 +2909,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (likely(*uring_locked)) req->io_task_work.func(req, uring_locked); else - __io_req_complete_post(req, req->result, + __io_req_complete_post(req, req->cqe.res, io_put_kbuf_comp(req)); node = next; } while (node); @@ -2467,15 +2950,11 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node1, *node2; - if (!tctx->task_list.first && - !tctx->prior_task_list.first && uring_locked) - io_submit_flush_completions(ctx); - spin_lock_irq(&tctx->task_lock); - node1 = tctx->prior_task_list.first; + node1 = tctx->prio_task_list.first; node2 = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); if (!node2 && !node1) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); @@ -2484,10 +2963,13 @@ static void tctx_task_work(struct callback_head *cb) if (node1) handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) handle_tw_list(node2, &ctx, &uring_locked); cond_resched(); + + if (data_race(!tctx->task_list.first) && + data_race(!tctx->prio_task_list.first) && uring_locked) + io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -2497,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void io_req_task_work_add(struct io_kiocb *req, bool priority) +static void __io_req_task_work_add(struct io_kiocb *req, + struct io_uring_task *tctx, + struct io_wq_work_list *list) { - struct task_struct *tsk = req->task; - struct io_uring_task *tctx = tsk->io_uring; - enum task_work_notify_mode notify; + struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; bool running; - WARN_ON_ONCE(!tctx); - io_drop_inflight_file(req); spin_lock_irqsave(&tctx->task_lock, flags); - if (priority) - wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list); - else - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -2524,22 +3001,15 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) if (running) return; - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. For - * all other cases, use TWA_SIGNAL unconditionally to ensure we're - * processing task_work. There's no reliable way to tell if TWA_RESUME - * will do the job. - */ - notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { - if (notify == TWA_NONE) - wake_up_process(tsk); + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + + if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - } spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list); + node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -2551,47 +3021,73 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority) } } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_work_add(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_task_prio_work_add(struct io_kiocb *req) +{ + struct io_uring_task *tctx = req->task->io_uring; + + if (req->ctx->flags & IORING_SETUP_SQPOLL) + __io_req_task_work_add(req, tctx, &tctx->prio_task_list); + else + __io_req_task_work_add(req, tctx, &tctx->task_list); +} + +static void io_req_tw_post(struct io_kiocb *req, bool *locked) +{ + io_req_complete_post(req, req->cqe.res, req->cqe.flags); +} + +static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; + req->io_task_work.func = io_req_tw_post; + io_req_task_work_add(req); +} + +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +{ /* not needed for normal modes, but SQPOLL depends on it */ - io_tw_lock(ctx, locked); - io_req_complete_failed(req, req->result); + io_tw_lock(req->ctx, locked); + io_req_complete_failed(req, req->cqe.res); } static void io_req_task_submit(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; - - io_tw_lock(ctx, locked); + io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) - __io_queue_sqe(req); + io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->result = ret; + req->cqe.res = ret; req->io_task_work.func = io_req_task_cancel; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - io_req_task_work_add(req, false); + io_req_task_work_add(req); } static void io_req_task_queue_reissue(struct io_kiocb *req) { - req->io_task_work.func = io_queue_async_work; - io_req_task_work_add(req, false); + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); } -static inline void io_queue_next(struct io_kiocb *req) +static void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2599,17 +3095,6 @@ static inline void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_req(struct io_kiocb *req) -{ - io_queue_next(req); - __io_free_req(req); -} - -static void io_free_req_work(struct io_kiocb *req, bool *locked) -{ - io_free_req(req); -} - static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -2621,15 +3106,30 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (unlikely(req->flags & REQ_F_REFCOUNT)) { - node = req->comp_list.next; - if (!req_ref_put_and_test(req)) - continue; + if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REFCOUNT) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + struct async_poll *apoll = req->apoll; + + if (apoll->double_poll) + kfree(apoll->double_poll); + list_add(&apoll->poll.wait.entry, + &ctx->apoll_cache); + req->flags &= ~REQ_F_POLLED; + } + if (req->flags & IO_REQ_LINK_FLAGS) + io_queue_next(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); } + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); io_req_put_rsrc_locked(req, ctx); - io_queue_next(req); - io_dismantle_req(req); if (req->task != task) { if (task) @@ -2639,7 +3139,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, } task_refs++; node = req->comp_list.next; - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); } while (node); if (task) @@ -2658,16 +3158,11 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(req, req->result, req->cflags); - if ((req->flags & REQ_F_POLLED) && req->apoll) { - struct async_poll *apoll = req->apoll; - - if (apoll->double_poll) - kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); - req->flags &= ~REQ_F_POLLED; + if (!(req->flags & REQ_F_CQE_SKIP)) { + if (!(ctx->flags & IORING_SETUP_CQE32)) + __io_fill_cqe_req_filled(ctx, req); + else + __io_fill_cqe32_req_filled(ctx, req); } } @@ -2690,23 +3185,18 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { - nxt = io_req_find_next(req); - __io_free_req(req); + if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) + nxt = io_req_find_next(req); + io_free_req(req); } return nxt; } static inline void io_put_req(struct io_kiocb *req) { - if (req_ref_put_and_test(req)) - io_free_req(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req) -{ if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req_work; - io_req_task_work_add(req, false); + io_queue_next(req); + io_free_req(req); } } @@ -2789,11 +3279,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; + nr_events++; if (unlikely(req->flags & REQ_F_CQE_SKIP)) continue; - - __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0)); - nr_events++; + __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0)); } if (unlikely(!nr_events)) @@ -2839,22 +3328,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; int ret = 0; + unsigned long check_cq; /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->check_cq_overflow)) + check_cq = READ_ONCE(ctx->check_cq); + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) - goto out; + return 0; + + /* + * Similarly do not spin if we have not informed the user of any + * dropped CQE. + */ + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; + do { /* * If a submit got punted to a workqueue, we can have the @@ -2884,8 +3377,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) nr_events += ret; ret = 0; } while (nr_events < min && !need_resched()); -out: - mutex_unlock(&ctx->uring_lock); + return ret; } @@ -2958,21 +3450,21 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) } else { fsnotify_access(req->file); } - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return true; } req_set_fail(req); - req->result = res; + req->cqe.res = res; } return false; } static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->result; + int res = req->cqe.res; if (*locked) { io_req_complete_state(req, res, io_put_kbuf(req, 0)); @@ -2988,7 +3480,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->result, + __io_req_complete(req, issue_flags, req->cqe.res, io_put_kbuf(req, issue_flags)); } @@ -2998,9 +3490,9 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->result = res; + req->cqe.res = res; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL)); + io_req_task_prio_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -3009,12 +3501,12 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (unlikely(res != req->result)) { + if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; return; } - req->result = res; + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ @@ -3124,6 +3616,8 @@ static unsigned int io_file_get_flags(struct file *file) res |= FFS_ISREG; if (__io_file_supports_nowait(file, mode)) res |= FFS_NOWAIT; + if (io_file_need_scm(file)) + res |= FFS_SCM; return res; } @@ -3155,6 +3649,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->rw.flags = READ_ONCE(sqe->rw_flags); + /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3183,19 +3678,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct kiocb *kiocb = &req->rw.kiocb; - bool is_stream = req->file->f_mode & FMODE_STREAM; - if (kiocb->ki_pos == -1) { - if (!is_stream) { - req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; - } else { - kiocb->ki_pos = 0; - return NULL; - } + if (kiocb->ki_pos != -1) + return &kiocb->ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + return &kiocb->ki_pos; } - return is_stream ? NULL : &kiocb->ki_pos; + + kiocb->ki_pos = 0; + return NULL; } static void kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -3304,77 +3798,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, imu); } -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) { - if (needs_lock) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. - */ - if (needs_lock) - mutex_lock(&ctx->uring_lock); -} - -static void io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - struct list_head *list; - - list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)]; - INIT_LIST_HEAD(&bl->buf_list); bl->bgid = bgid; - list_add(&bl->list, list); + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } -static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, unsigned int issue_flags) +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) { - struct io_buffer *kbuf = req->kbuf; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - - if (req->flags & REQ_F_BUFFER_SELECTED) - return kbuf; - - io_ring_submit_lock(ctx, needs_lock); + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; - lockdep_assert_held(&ctx->uring_lock); - - bl = io_buffer_get_list(ctx, bgid); - if (bl && !list_empty(&bl->buf_list)) { kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); if (*len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u32 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) { + io_ring_submit_unlock(req->ctx, issue_flags); + return NULL; + } + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; } else { - kbuf = ERR_PTR(-ENOBUFS); + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1; + buf = page_address(bl->buf_pages[index]); + buf += off; } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; - io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; + if (issue_flags & IO_URING_F_UNLOCKED) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); } -static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { - struct io_buffer *kbuf; - u16 bgid; + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); - bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, issue_flags); - if (IS_ERR(kbuf)) - return kbuf; - return u64_to_user_ptr(kbuf->addr); + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; } #ifdef CONFIG_COMPAT @@ -3384,7 +3897,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; - ssize_t len; + size_t len; uiov = u64_to_user_ptr(req->rw.addr); if (!access_ok(uiov, sizeof(*uiov))) @@ -3395,11 +3908,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = (compat_size_t) len; + req->rw.len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3417,22 +3931,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, issue_flags); - if (IS_ERR(buf)) - return PTR_ERR(buf); + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + req->rw.addr = (unsigned long) buf; iov[0].iov_base = buf; - iov[0].iov_len = len; + req->rw.len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf = req->kbuf; - - iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov[0].iov_len = kbuf->len; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(req->rw.addr); + iov[0].iov_len = req->rw.len; return 0; } if (req->rw.len != 1) @@ -3446,6 +3959,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -3464,18 +3984,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, return NULL; } - /* buffer index only valid with fixed read/write, or buffer select */ - if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) - return ERR_PTR(-EINVAL); - buf = u64_to_user_ptr(req->rw.addr); sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, issue_flags); - if (IS_ERR(buf)) - return ERR_CAST(buf); + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + req->rw.addr = (unsigned long) buf; req->rw.len = sqe_len; } @@ -3777,6 +4294,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; @@ -3825,9 +4343,11 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iovec = NULL; } ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) + if (unlikely(ret)) { + kfree(iovec); return ret; - req->result = iov_iter_count(&s->iter); + } + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3843,7 +4363,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(READ, req->file, ppos, req->result); + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); if (unlikely(ret)) { kfree(iovec); return ret; @@ -3865,7 +4385,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret == req->result || ret <= 0 || !force_nonblock || + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; @@ -3951,9 +4471,11 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iovec = NULL; } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) + if (unlikely(ret)) { + kfree(iovec); return ret; - req->result = iov_iter_count(&s->iter); + } + req->cqe.res = iov_iter_count(&s->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -3973,7 +4495,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); - ret = rw_verify_area(WRITE, req->file, ppos, req->result); + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); if (unlikely(ret)) goto out_free; @@ -4037,9 +4559,7 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4076,22 +4596,257 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } +static inline void __io_xattr_finish(struct io_kiocb *req) +{ + struct io_xattr *ix = &req->xattr; + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + __io_xattr_finish(req); + io_req_complete(req, ret); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_fgetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +static int io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return 0; +} + +static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = &req->xattr; + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +static int io_fsetxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = &req->xattr; + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + + return 0; +} + +static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = &req->xattr; + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return 0; +} + static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_unlink *un = &req->unlink; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4125,8 +4880,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4137,10 +4890,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, struct io_mkdir *mkd = &req->mkdir; const char __user *fname; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4168,8 +4918,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4180,10 +4928,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, struct io_symlink *sl = &req->symlink; const char __user *oldpath, *newpath; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || - sqe->splice_fd_in) + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4217,8 +4962,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4229,9 +4972,7 @@ static int io_linkat_prep(struct io_kiocb *req, struct io_hardlink *lnk = &req->hardlink; const char __user *oldf, *newf; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4268,9 +5009,97 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_complete(req, ret); + return 0; +} + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + req->uring_cmd.task_work_cb(&req->uring_cmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + + req->uring_cmd.task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_prio_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + if (ret < 0) req_set_fail(req); - io_req_complete(req, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + __io_req_complete32(req, 0, ret, 0, res2, 0); + else + io_req_complete(req, ret); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +static int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + return 0; +} + +static int io_uring_cmd_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + + if (sqe->rw_flags) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(ioucmd, ret, 0); return 0; } @@ -4278,9 +5107,7 @@ static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; @@ -4305,8 +5132,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, req->shutdown.how); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4320,9 +5145,6 @@ static int __io_splice_prep(struct io_kiocb *req, struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); if (unlikely(sp->flags & ~valid_flags)) @@ -4351,7 +5173,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, IO_URING_F_UNLOCKED); + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); else in = io_file_get_normal(req, sp->splice_fd_in); if (!in) { @@ -4367,7 +5189,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); return 0; } @@ -4393,7 +5215,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, IO_URING_F_UNLOCKED); + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); else in = io_file_get_normal(req, sp->splice_fd_in); if (!in) { @@ -4412,7 +5234,20 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - io_req_complete(req, ret); + __io_req_complete(req, 0, ret, 0); + return 0; +} + +static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If the ring is setup with CQE32, relay back addr/addr + */ + if (req->ctx->flags & IORING_SETUP_CQE32) { + req->nop.extra1 = READ_ONCE(sqe->addr); + req->nop.extra2 = READ_ONCE(sqe->addr2); + } + return 0; } @@ -4421,20 +5256,31 @@ done: */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + unsigned int cflags; + void __user *buf; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + if (req->flags & REQ_F_BUFFER_SELECT) { + size_t len = 1; + + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + } - __io_req_complete(req, issue_flags, 0, 0); + cflags = io_put_kbuf(req, issue_flags); + if (!(req->ctx->flags & IORING_SETUP_CQE32)) + __io_req_complete(req, issue_flags, 0, cflags); + else + __io_req_complete32(req, issue_flags, 0, cflags, + req->nop.extra1, req->nop.extra2); return 0; } static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags || - sqe->splice_fd_in || sqe->buf_index || sqe->personality)) + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) return -EINVAL; req->msg.user_data = READ_ONCE(sqe->off); @@ -4470,17 +5316,15 @@ done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; return 0; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -4504,8 +5348,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4513,10 +5355,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4534,9 +5373,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - if (ret < 0) - req_set_fail(req); - else + if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); return 0; @@ -4547,9 +5384,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -4604,6 +5439,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } +static int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + if (table->alloc_hint >= nr) + table->alloc_hint = 0; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) { + table->alloc_hint = ret + 1; + return ret; + } + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (alloc_slot) { + io_ring_submit_lock(ctx, issue_flags); + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) { + io_ring_submit_unlock(ctx, issue_flags); + return ret; + } + + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (alloc_slot) { + io_ring_submit_unlock(ctx, issue_flags); + if (!ret) + return file_slot; + } + + return ret; +} + static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; @@ -4659,8 +5549,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) fd_install(ret, file); else - ret = io_install_fixed_file(req, file, issue_flags, - req->open.file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + req->open.file_slot); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -4681,7 +5571,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || sqe->splice_fd_in) return -EINVAL; @@ -4704,6 +5594,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + /* the head kbuf is the list itself */ while (!list_empty(&bl->buf_list)) { struct io_buffer *nxt; @@ -4725,22 +5629,23 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); - - lockdep_assert_held(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; bl = io_buffer_get_list(ctx, p->bgid); - if (bl) - ret = __io_remove_buffers(ctx, bl, p->nbufs); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4751,7 +5656,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4848,26 +5753,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, return i ? 0 : -ENOMEM; } +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); - lockdep_assert_held(&ctx->uring_lock); + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { - bl = kmalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); if (!bl) { ret = -ENOMEM; goto err; } - io_buffer_add_list(ctx, bl, p->bgid); + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; } ret = io_add_buffers(ctx, p, bl); @@ -4876,7 +5811,7 @@ err: req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return 0; } @@ -4884,9 +5819,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; req->epoll.epfd = READ_ONCE(sqe->fd); @@ -4930,9 +5863,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; req->madvise.addr = READ_ONCE(sqe->addr); @@ -4954,8 +5885,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4965,9 +5894,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; req->fadvise.offset = READ_ONCE(sqe->off); @@ -5003,9 +5930,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { const char __user *path; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5041,19 +5966,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, ctx->buffer); - - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -5085,7 +6004,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) spin_unlock(&files->file_lock); goto err; } - file = fdt->fd[close->fd]; + file = rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); file = NULL; @@ -5119,12 +6039,7 @@ err: static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || - sqe->splice_fd_in)) + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -5143,13 +6058,18 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); - if (ret < 0) - req_set_fail(req); io_req_complete(req, ret); return 0; } #if defined(CONFIG_NET) +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) { @@ -5195,11 +6115,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) + return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5208,12 +6133,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5232,7 +6159,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - flags = req->sr_msg.msg_flags; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5245,12 +6176,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5265,6 +6205,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) int min_ret = 0; int ret; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -5278,7 +6222,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5291,8 +6235,19 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5384,14 +6339,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, return __io_recvmsg_copy_hdr(req, iomsg); } -static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = &req->sr_msg; - - return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); -} - static int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; @@ -5406,12 +6353,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(sqe->file_index)) + return -EINVAL; + if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); - sr->bgid = READ_ONCE(sqe->buf_group); + sr->flags = READ_ONCE(sqe->addr2); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -5424,19 +6375,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; - struct io_buffer *kbuf; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -5454,24 +6398,30 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = &iomsg; } - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - kmsg->fast_iov[0].iov_len = req->sr_msg.len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, - 1, req->sr_msg.len); + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); } - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, - kmsg->uaddr, flags); + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) return io_setup_async_msg(req, kmsg); @@ -5495,45 +6445,54 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_buffer *kbuf; struct io_sr_msg *sr = &req->sr_msg; struct msghdr msg; - void __user *buf = sr->buf; struct socket *sock; struct iovec iov; + unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, issue_flags); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - buf = u64_to_user_ptr(kbuf->addr); + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; } - ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; msg.msg_name = NULL; + msg.msg_namelen = 0; msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; msg.msg_controllen = 0; - msg.msg_namelen = 0; msg.msg_iocb = NULL; - msg.msg_flags = 0; - flags = req->sr_msg.msg_flags; + flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) @@ -5562,36 +6521,49 @@ out_free: ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + __io_req_complete(req, issue_flags, ret, cflags); return 0; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; + unsigned flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) - return -EINVAL; + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { + struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -5599,6 +6571,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; +retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5610,7 +6583,89 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = 0; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = &req->sock; + + if (sqe->addr || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +static int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = &req->sock; + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; @@ -5620,7 +6675,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) ret = fd; } else { ret = io_install_fixed_file(req, file, issue_flags, - accept->file_slot - 1); + sock->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -5638,10 +6693,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || - sqe->splice_fd_in) + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5714,6 +6766,7 @@ IO_NETOP_PREP_ASYNC(sendmsg); IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); +IO_NETOP_PREP(socket); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -5764,7 +6817,7 @@ static void io_poll_req_insert(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct hlist_head *list; - list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; hlist_add_head(&req->hash_node, list); } @@ -5823,23 +6876,23 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. >0 when no action require, which is * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->result. + * the request, then the mask is stored in req->cqe.res. */ -static int io_poll_check_events(struct io_kiocb *req, bool locked) +static int io_poll_check_events(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - struct io_poll_iocb *poll = io_poll_get_single(req); - int v; + int v, ret; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) - io_poll_mark_cancelled(req); + return -ECANCELED; do { v = atomic_read(&req->poll_refs); @@ -5850,32 +6903,46 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked) if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; - if (!req->result) { - struct poll_table_struct pt = { ._key = req->cflags }; + if (!req->cqe.res) { + struct poll_table_struct pt = { ._key = req->apoll_events }; + unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; - if (unlikely(!io_assign_file(req, IO_URING_F_UNLOCKED))) - req->result = -EBADF; - else - req->result = vfs_poll(req->file, &pt) & req->cflags; + if (unlikely(!io_assign_file(req, flags))) + return -EBADF; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } - /* multishot, just fill an CQE and proceed */ - if (req->result && !(req->cflags & EPOLLONESHOT)) { - __poll_t mask = mangle_poll(req->result & poll->events); + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); bool filled; spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->user_data, mask, - IORING_CQE_F_MORE); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (unlikely(!filled)) - return -ECANCELED; - io_cqring_ev_posted(ctx); - } else if (req->result) { - return 0; + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; } + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + ret = io_issue_sqe(req, + IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (ret) + return ret; + /* * Release all references, retry if someone tried to restart * task_work while we were executing it. @@ -5890,21 +6957,21 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; if (!ret) { - req->result = mangle_poll(req->result & req->poll.events); + req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); } else { - req->result = ret; + req->cqe.res = ret; req_set_fail(req); } io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->result, 0); + __io_req_complete_post(req, req->cqe.res, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -5915,7 +6982,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_poll_check_events(req, *locked); + ret = io_poll_check_events(req, locked); if (ret > 0) return; @@ -5930,26 +6997,27 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, int events) +static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events) { - req->result = mask; + req->cqe.res = mask; /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different * CPU. We want to avoid pulling in req->apoll->events for that * case. */ - req->cflags = events; + req->apoll_events = events; if (req->opcode == IORING_OP_POLL_ADD) req->io_task_work.func = io_poll_task_func; else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask); - io_req_task_work_add(req, false); + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, int events) +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res, events); @@ -5964,6 +7032,7 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) #define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) @@ -5998,7 +7067,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, } /* for instances that support it check for an event match first */ - if (mask && !(mask & poll->events)) + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { @@ -6084,6 +7153,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, int v; INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; @@ -6154,28 +7224,34 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; + __poll_t mask = POLLPRI | POLLERR; int ret; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; - if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + if (!file_can_poll(req->file)) + return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; if (def->pollin) { - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if ((req->opcode == IORING_OP_RECVMSG) && (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; + mask &= ~EPOLLIN; } else { - mask |= POLLOUT | POLLWRNORM; + mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; - if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, poll.wait.entry); list_del_init(&apoll->poll.wait.entry); @@ -6195,7 +7271,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode, + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -6228,24 +7304,53 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, return found; } -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { struct hlist_head *list; struct io_kiocb *req; - list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; hlist_for_each_entry(req, list, hash_node) { - if (sqe_addr != req->user_data) + if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } return req; } return NULL; } +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + static bool io_poll_disarm(struct io_kiocb *req) __must_hold(&ctx->completion_lock) { @@ -6256,12 +7361,15 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, - bool poll_only) +static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) { - struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); + struct io_kiocb *req; + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); if (!req) return -ENOENT; io_poll_cancel_req(req); @@ -6288,9 +7396,7 @@ static int io_poll_update_prep(struct io_kiocb *req, struct io_poll_update *upd = &req->poll_update; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -6320,9 +7426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe struct io_poll_iocb *poll = &req->poll; u32 flags; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) + if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) @@ -6331,7 +7435,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; io_req_set_refcount(req); - req->cflags = poll->events = io_poll_parse_events(sqe, flags); + req->apoll_events = poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6352,13 +7456,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; bool locked; spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, req->poll_update.old_user_data, true); + preq = io_poll_find(ctx, true, &cd); if (!preq || !io_poll_disarm(preq)) { spin_unlock(&ctx->completion_lock); ret = preq ? -EALREADY : -ENOENT; @@ -6374,7 +7479,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) preq->poll.events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) - preq->user_data = req->poll_update.new_user_data; + preq->cqe.user_data = req->poll_update.new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ @@ -6383,7 +7488,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) } req_set_fail(preq); - preq->result = -ECANCELED; + preq->cqe.res = -ECANCELED; locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: @@ -6411,14 +7516,14 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->result = -ETIME; + req->cqe.res = -ETIME; req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - __u64 user_data) + struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; @@ -6426,9 +7531,16 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, bool found = false; list_for_each_entry(req, &ctx->timeout_list, timeout.list) { - found = user_data == req->user_data; - if (found) - break; + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != req->cqe.user_data) + continue; + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + found = true; + break; } if (!found) return ERR_PTR(-ENOENT); @@ -6440,11 +7552,14 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, return req; } -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->completion_lock) - __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -6477,7 +7592,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, bool found = false; list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->user_data; + found = user_data == req->cqe.user_data; if (found) break; } @@ -6497,7 +7612,8 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout_data *data; if (IS_ERR(req)) @@ -6517,11 +7633,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { struct io_timeout_rem *tr = &req->timeout_rem; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->ltimeout = false; @@ -6562,10 +7676,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, tr->addr); - spin_unlock_irq(&ctx->timeout_lock); + ret = io_timeout_cancel(ctx, &cd); spin_unlock(&ctx->completion_lock); } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); @@ -6591,10 +7705,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned flags; u32 off = READ_ONCE(sqe->off); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || - sqe->splice_fd_in) + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -6693,30 +7804,42 @@ add: return 0; } -struct io_cancel_data { - struct io_ring_ctx *ctx; - u64 user_data; -}; - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - return req->ctx == cd->ctx && req->user_data == cd->user_data; + if (req->ctx != cd->ctx) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; } -static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, - struct io_ring_ctx *ctx) +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) { - struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; + bool all; if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -6732,14 +7855,14 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) +static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + ret = io_async_cancel_one(req->task->io_uring, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. @@ -6748,56 +7871,98 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) return 0; spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; - - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); out: spin_unlock(&ctx->completion_lock); return ret; } +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || - sqe->splice_fd_in) + if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); + req->cancel.flags = READ_ONCE(sqe->cancel_flags); + if (req->cancel.flags & ~CANCEL_FLAGS) + return -EINVAL; + if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { + if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + req->cancel.fd = READ_ONCE(sqe->fd); + } + return 0; } -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; - u64 sqe_addr = req->cancel.addr; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; - int ret; + int ret, nr = 0; - ret = io_try_cancel_userdata(req, sqe_addr); - if (ret != -ENOENT) - goto done; + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; - ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); - if (ret != -ENOENT) - break; + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } + } + io_ring_submit_unlock(ctx, issue_flags); + return all ? nr : ret; +} + +static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = req->cancel.addr, + .flags = req->cancel.flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->cancel.fd, + issue_flags); + else + req->file = io_file_get_normal(req, req->cancel.fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; } - io_ring_submit_unlock(ctx, needs_lock); + + ret = __io_async_cancel(&cd, req, issue_flags); done: if (ret < 0) req_set_fail(req); @@ -6810,7 +7975,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) + if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6824,7 +7989,6 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; @@ -6833,11 +7997,12 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.nr = 0; up.tags = 0; up.resv = 0; + up.resv2 = 0; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); @@ -6849,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { switch (req->opcode) { case IORING_OP_NOP: - return 0; + return io_nop_prep(req, sqe); case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: @@ -6923,6 +8088,18 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_linkat_prep(req, sqe); case IORING_OP_MSG_RING: return io_msg_ring_prep(req, sqe); + case IORING_OP_FSETXATTR: + return io_fsetxattr_prep(req, sqe); + case IORING_OP_SETXATTR: + return io_setxattr_prep(req, sqe); + case IORING_OP_FGETXATTR: + return io_fgetxattr_prep(req, sqe); + case IORING_OP_GETXATTR: + return io_getxattr_prep(req, sqe); + case IORING_OP_SOCKET: + return io_socket_prep(req, sqe); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6932,7 +8109,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_req_prep_async(struct io_kiocb *req) { - if (!io_op_defs[req->opcode].needs_async_setup) + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->cqe.fd); + if (!def->needs_async_setup) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; @@ -6950,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req) return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: return io_connect_prep_async(req); + case IORING_OP_URING_CMD: + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", req->opcode); @@ -6959,9 +8143,10 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; + struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(req, req) + io_for_each_link(cur, req) seq--; return seq; } @@ -7004,7 +8189,7 @@ fail: goto queue; } - trace_io_uring_defer(ctx, req, req->user_data, req->opcode); + trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -7066,6 +8251,12 @@ static void io_clean_op(struct io_kiocb *req) if (req->statx.filename) putname(req->statx.filename); break; + case IORING_OP_SETXATTR: + case IORING_OP_FSETXATTR: + case IORING_OP_GETXATTR: + case IORING_OP_FGETXATTR: + __io_xattr_finish(req); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -7088,15 +8279,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return true; if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->work.fd, issue_flags); + req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else - req->file = io_file_get_normal(req, req->work.fd); - if (req->file) - return true; + req->file = io_file_get_normal(req, req->cqe.fd); - req_set_fail(req); - req->result = -EBADF; - return false; + return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -7104,13 +8291,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); if (!io_op_defs[req->opcode].audit_skip) audit_uring_entry(req->opcode); - if (unlikely(!io_assign_file(req, issue_flags))) - return -EBADF; switch (req->opcode) { case IORING_OP_NOP: @@ -7225,6 +8413,24 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_MSG_RING: ret = io_msg_ring(req, issue_flags); break; + case IORING_OP_FSETXATTR: + ret = io_fsetxattr(req, issue_flags); + break; + case IORING_OP_SETXATTR: + ret = io_setxattr(req, issue_flags); + break; + case IORING_OP_FGETXATTR: + ret = io_fgetxattr(req, issue_flags); + break; + case IORING_OP_GETXATTR: + ret = io_getxattr(req, issue_flags); + break; + case IORING_OP_SOCKET: + ret = io_socket(req, issue_flags); + break; + case IORING_OP_URING_CMD: + ret = io_uring_cmd(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -7258,7 +8464,6 @@ static void io_wq_submit_work(struct io_wq_work *work) const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; - struct io_kiocb *timeout; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ @@ -7267,20 +8472,19 @@ static void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - timeout = io_prep_linked_timeout(req); - if (timeout) - io_queue_linked_timeout(timeout); - - if (!io_assign_file(req, issue_flags)) { - err = -EBADF; - work->flags |= IO_WQ_WORK_CANCEL; - } + io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { +fail: io_req_task_queue_fail(req, err); return; } + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + goto fail; + } if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; @@ -7301,6 +8505,8 @@ static void io_wq_submit_work(struct io_wq_work *work) * wait for request slots on the block side. */ if (!needs_poll) { + if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + break; cond_resched(); continue; } @@ -7346,8 +8552,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, struct file *file = NULL; unsigned long file_ptr; - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; @@ -7358,9 +8563,9 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); io_req_set_rsrc_node(req, ctx, 0); + WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap)); out: - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -7381,7 +8586,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->user_data, fd); + trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) @@ -7395,8 +8600,14 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) int ret = -ENOENT; if (prev) { - if (!(req->task->flags & PF_EXITING)) - ret = io_try_cancel_userdata(req, prev->user_data); + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { @@ -7430,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req, false); + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -7456,10 +8667,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void io_queue_sqe_arm_apoll(struct io_kiocb *req) +static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; + + if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { + io_req_complete_failed(req, ret); + return; + } + + linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: @@ -7470,7 +8688,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; @@ -7480,10 +8698,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static inline void __io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -7496,22 +8713,23 @@ static inline void __io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) { - linked_timeout = io_prep_linked_timeout(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - io_queue_sqe_arm_apoll(req); - } else { - io_req_complete_failed(req, ret); - } + if (likely(!ret)) + io_arm_ltimeout(req); + else + io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); + if (unlikely(req->flags & REQ_F_FAIL)) { + /* + * We don't submit, fail them all, for that replace hardlinks + * with normal links. Extra REQ_F_LINK is tolerated. + */ + req->flags &= ~REQ_F_HARDLINK; + req->flags |= REQ_F_LINK; + io_req_complete_failed(req, req->cqe.res); } else if (unlikely(req->ctx->drain_active)) { io_drain_req(req); } else { @@ -7520,19 +8738,10 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req, NULL); + io_queue_iowq(req, NULL); } } -static inline void io_queue_sqe(struct io_kiocb *req) - __must_hold(&req->ctx->uring_lock) -{ - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) - __io_queue_sqe(req); - else - io_queue_sqe_fallback(req); -} - /* * Check SQE restrictions (opcode and flags). * @@ -7587,9 +8796,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); + req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->fixed_rsrc_refs = NULL; + req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { @@ -7600,9 +8809,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[opcode].buffer_select) - return -EOPNOTSUPP; + if (sqe_flags & IOSQE_BUFFER_SELECT) { + if (!io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + req->buf_index = READ_ONCE(sqe->buf_group); + } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { @@ -7625,10 +8836,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } + if (!io_op_defs[opcode].ioprio && sqe->ioprio) + return -EINVAL; + if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; - req->work.fd = READ_ONCE(sqe->fd); + req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the @@ -7660,7 +8876,44 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_req_prep(req, sqe); } -static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, +static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, + struct io_kiocb *req, int ret) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; + struct io_kiocb *head = link->head; + + trace_io_uring_req_failed(sqe, ctx, req, ret); + + /* + * Avoid breaking links in the middle as it renders links with SQPOLL + * unusable. Instead of failing eagerly, continue assembling the link if + * applicable and mark the head with REQ_F_FAIL. The link flushing code + * should find the flag and handle the rest. + */ + req_fail_link_node(req, ret); + if (head && !(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + + if (!(req->flags & IO_REQ_LINK_FLAGS)) { + if (head) { + link->last->link = req; + link->head = NULL; + req = head; + } + io_queue_sqe_fallback(req); + return ret; + } + + if (head) + link->last->link = req; + else + link->head = req; + link->last = req; + return 0; +} + +static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { @@ -7668,35 +8921,11 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret; ret = io_init_req(ctx, req, sqe); - if (unlikely(ret)) { - trace_io_uring_req_failed(sqe, ctx, req, ret); - - /* fail even hard links since we don't submit */ - if (link->head) { - /* - * we can judge a link req is failed or cancelled by if - * REQ_F_FAIL is set, but the head is an exception since - * it may be set REQ_F_FAIL because of other req's failure - * so let's leverage req->result to distinguish if a head - * is set REQ_F_FAIL because of its failure or other req's - * failure so that we can set the correct ret code for it. - * init result here to avoid affecting the normal path. - */ - if (!(link->head->flags & REQ_F_FAIL)) - req_fail_link_node(link->head, -ECANCELED); - } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - /* - * the current req is a normal req, we should return - * error and thus break the submittion loop. - */ - io_req_complete_failed(req, ret); - return ret; - } - req_fail_link_node(req, ret); - } + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode, + trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, req->flags, true, ctx->flags & IORING_SETUP_SQPOLL); @@ -7707,29 +8936,32 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (link->head) { - struct io_kiocb *head = link->head; - - if (!(req->flags & REQ_F_FAIL)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) { - req_fail_link_node(req, ret); - if (!(head->flags & REQ_F_FAIL)) - req_fail_link_node(head, -ECANCELED); - } - } - trace_io_uring_link(ctx, req, head); + if (unlikely(link->head)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) + return io_submit_fail_init(sqe, req, ret); + + trace_io_uring_link(ctx, req, link->head); link->last->link = req; link->last = req; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) return 0; - /* last request of a link, enqueue the link */ + /* last request of the link, flush it */ + req = link->head; link->head = NULL; - req = head; - } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; + if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) + goto fallback; + + } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | + REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { + if (req->flags & IO_REQ_LINK_FLAGS) { + link->head = req; + link->last = req; + } else { +fallback: + io_queue_sqe_fallback(req); + } return 0; } @@ -7744,8 +8976,8 @@ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - if (state->link.head) - io_queue_sqe(state->link.head); + if (unlikely(state->link.head)) + io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) @@ -7799,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * though the application is the one updating it. */ head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) + if (likely(head < ctx->sq_entries)) { + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; return &ctx->sq_sqes[head]; + } /* drop invalid entries */ ctx->cq_extra--; @@ -7813,54 +9049,52 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); - int submitted = 0; + unsigned int left; + int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, entries); - io_get_task_refs(nr); + ret = left = min3(nr, ctx->sq_entries, entries); + io_get_task_refs(left); + io_submit_state_start(&ctx->submit_state, left); - io_submit_state_start(&ctx->submit_state, nr); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) { - if (!submitted) - submitted = -EAGAIN; + if (unlikely(!io_alloc_req_refill(ctx))) break; - } req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + io_req_add_to_cache(req, ctx); break; } - /* will complete beyond this point, count as submitted */ - submitted++; - if (io_submit_sqe(ctx, req, sqe)) { - /* - * Continue submitting even for sqe failure if the - * ring was setup with IORING_SETUP_SUBMIT_ALL - */ - if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) - break; - } - } while (submitted < nr); - if (unlikely(submitted != nr)) { - int ref_used = (submitted == -EAGAIN) ? 0 : submitted; - int unused = nr - ref_used; + /* + * Continue submitting even for sqe failure if the + * ring was setup with IORING_SETUP_SUBMIT_ALL + */ + if (unlikely(io_submit_sqe(ctx, req, sqe)) && + !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { + left--; + break; + } + } while (--left); - current->io_uring->cached_refs += unused; + if (unlikely(left)) { + ret -= left; + /* try again if it submitted nothing and can't allocate a req */ + if (!ret && io_req_cache_empty(ctx)) + ret = -EAGAIN; + current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); - - return submitted; + return ret; } static inline bool io_sqd_events_pending(struct io_sq_data *sqd) @@ -7868,23 +9102,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) return READ_ONCE(sqd->state); } -static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) -{ - /* Tell userspace we may need a wakeup call */ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - -static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) -{ - spin_lock(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->sq_flags, - ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock(&ctx->completion_lock); -} - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { unsigned int to_submit; @@ -8000,8 +9217,8 @@ static int io_sq_thread(void *data) bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - io_ring_set_wakeup_flag(ctx); - + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); if ((ctx->flags & IORING_SETUP_IOPOLL) && !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; @@ -8012,7 +9229,7 @@ static int io_sq_thread(void *data) * Ensure the store of the wakeup flag is not * reordered with the load of the SQ tail */ - smp_mb(); + smp_mb__after_atomic(); if (io_sqring_entries(ctx)) { needs_sched = false; @@ -8026,7 +9243,8 @@ static int io_sq_thread(void *data) mutex_lock(&sqd->lock); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_clear_wakeup_flag(ctx); + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); } finish_wait(&sqd->wait, &wait); @@ -8036,7 +9254,7 @@ static int io_sq_thread(void *data) io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); @@ -8076,7 +9294,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) + if (io_should_wake(iowq) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -8098,15 +9317,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ktime_t timeout) { int ret; + unsigned long check_cq; /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(); if (ret || io_should_wake(iowq)) return ret; + check_cq = READ_ONCE(ctx->check_cq); /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->check_cq_overflow)) + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) return 1; - + if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) + return -EBADR; if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; return 1; @@ -8171,10 +9393,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); - finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; @@ -8412,17 +9634,57 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL_ACCOUNT); - return !!table->files; + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; } static void io_free_file_tables(struct io_file_table *table) { kvfree(table->files); + bitmap_free(table->bitmap); table->files = NULL; + table->bitmap = NULL; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + if (bit == table->alloc_hint) + table->alloc_hint++; +} + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(ctx, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + #if defined(CONFIG_UNIX) if (ctx->ring_sock) { struct sock *sock = ctx->ring_sock->sk; @@ -8431,16 +9693,6 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) kfree_skb(skb); } -#else - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - - file = io_file_from_index(ctx, i); - if (file) - fput(file); - } #endif io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); @@ -8585,107 +9837,66 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, return sqd; } -#if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. */ -static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) { +#if defined(CONFIG_UNIX) struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; struct scm_fp_list *fpl; struct sk_buff *skb; - int i, nr_files; - - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } + if (likely(!io_file_need_scm(file))) + return 0; - skb->sk = sk; + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); - nr_files = 0; - fpl->user = get_uid(current_user()); - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; - if (!file) - continue; - fpl->fp[nr_files] = get_file(file); - unix_inflight(fpl->user, fpl->fp[nr_files]); - nr_files++; - } + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } - if (nr_files) { + fpl->user = get_uid(current_user()); fpl->max = SCM_MAX_FD; - fpl->count = nr_files; + fpl->count = 0; + UNIXCB(skb).fp = fpl; + skb->sk = sk; skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); - skb_queue_head(&sk->sk_receive_queue, skb); - - for (i = 0; i < nr; i++) { - struct file *file = io_file_from_index(ctx, i + offset); - - if (file) - fput(file); - } - } else { - kfree_skb(skb); - free_uid(fpl->user); - kfree(fpl); - } - - return 0; -} - -/* - * If UNIX sockets are enabled, fd passing can cause a reference cycle which - * causes regular reference counting to break down. We rely on the UNIX - * garbage collection to take care of this problem for us. - */ -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ - unsigned left, total; - int ret = 0; - - total = 0; - left = ctx->nr_user_files; - while (left) { - unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); - - ret = __io_sqe_files_scm(ctx, this_files, total); - if (ret) - break; - left -= this_files; - total += this_files; } - if (!ret) - return 0; - - while (total < ctx->nr_user_files) { - struct file *file = io_file_from_index(ctx, total); - - if (file) - fput(file); - total++; - } - - return ret; -} -#else -static int io_sqe_files_scm(struct io_ring_ctx *ctx) -{ + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif return 0; } -#endif static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { @@ -8696,6 +9907,11 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) struct sk_buff *skb; int i; + if (!io_file_need_scm(file)) { + fput(file); + return; + } + __skb_queue_head_init(&list); /* @@ -8760,15 +9976,17 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); - io_ring_submit_lock(ctx, lock_ring); spin_lock(&ctx->completion_lock); io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_ring_submit_unlock(ctx, lock_ring); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); } rsrc_data->do_put(ctx, prsrc); @@ -8822,27 +10040,31 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, if (ret) return ret; - ret = -ENOMEM; - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) - goto out_free; + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - if (copy_from_user(&fd, &fds[i], sizeof(fd))) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; - goto out_fput; + goto fail; } /* allow sparse sets */ - if (fd == -1) { + if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto out_fput; + goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) - goto out_fput; + goto fail; /* * Don't allow io_uring instances to be registered. If UNIX @@ -8853,74 +10075,23 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, */ if (file->f_op == &io_uring_fops) { fput(file); - goto out_fput; + goto fail; } - io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); - } - - ret = io_sqe_files_scm(ctx); - if (ret) { - __io_sqe_files_unregister(ctx); - return ret; - } - - io_rsrc_node_switch(ctx, NULL); - return ret; -out_fput: - for (i = 0; i < ctx->nr_user_files; i++) { - file = io_file_from_index(ctx, i); - if (file) + ret = io_scm_file_account(ctx, file); + if (ret) { fput(file); - } - io_free_file_tables(&ctx->file_table); - ctx->nr_user_files = 0; -out_free: - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return ret; -} - -static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, - int index) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head *head = &sock->sk_receive_queue; - struct sk_buff *skb; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb) { - struct scm_fp_list *fpl = UNIXCB(skb).fp; - - if (fpl->count < SCM_MAX_FD) { - __skb_unlink(skb, head); - spin_unlock_irq(&head->lock); - fpl->fp[fpl->count] = get_file(file); - unix_inflight(fpl->user, fpl->fp[fpl->count]); - fpl->count++; - spin_lock_irq(&head->lock); - __skb_queue_head(head, skb); - } else { - skb = NULL; + goto fail; } - } - spin_unlock_irq(&head->lock); - - if (skb) { - fput(file); - return 0; + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } - return __io_sqe_files_scm(ctx, 1, index); -#else + io_rsrc_node_switch(ctx, NULL); return 0; -#endif +fail: + __io_sqe_files_unregister(ctx); + return ret; } static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, @@ -8944,12 +10115,11 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8975,22 +10145,20 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, if (ret) goto err; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); needs_switch = true; } - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - ret = io_sqe_file_register(ctx, file, slot_index); - if (ret) { - file_slot->file_ptr = 0; - goto err; + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); } - - ret = 0; err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; @@ -9000,12 +10168,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; - bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_fixed_file *file_slot; struct file *file; int ret; - io_ring_submit_lock(ctx, needs_lock); + io_ring_submit_lock(ctx, issue_flags); ret = -ENXIO; if (unlikely(!ctx->file_data)) goto out; @@ -9028,10 +10195,11 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) goto out; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); io_rsrc_node_switch(ctx, ctx->file_data); ret = 0; out: - io_ring_submit_unlock(ctx, needs_lock); + io_ring_submit_unlock(ctx, issue_flags); return ret; } @@ -9077,6 +10245,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); needs_switch = true; } if (fd != -1) { @@ -9098,14 +10267,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - err = io_sqe_file_register(ctx, file, i); + err = io_scm_file_account(ctx, file); if (err) { - file_slot->file_ptr = 0; fput(file); break; } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); } } @@ -9185,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prior_task_list); + INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -9363,8 +10532,8 @@ static void *io_mem_alloc(size_t size) return (void *) __get_free_pages(gfp, get_order(size)); } -static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, - size_t *sq_offset) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -9372,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -9533,30 +10706,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, + int *npages) { - struct io_mapped_ubuf *imu = NULL; + unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; struct page **pages = NULL; - unsigned long off, start, end, ubuf; - size_t size; - int ret, pret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } + int i, pret, ret = -ENOMEM; - ubuf = (unsigned long) iov->iov_base; - end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; - *pimu = NULL; - ret = -ENOMEM; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto done; @@ -9566,10 +10727,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, if (!vmas) goto done; - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - ret = 0; mmap_read_lock(current->mm); pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, @@ -9587,6 +10744,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, break; } } + *npages = nr_pages; } else { ret = pret < 0 ? pret : -EFAULT; } @@ -9600,14 +10758,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unpin_user_pages(pages, pret); goto done; } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; - ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { - unpin_user_pages(pages, pret); + unpin_user_pages(pages, nr_pages); goto done; } - off = ubuf & ~PAGE_MASK; + off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; for (i = 0; i < nr_pages; i++) { size_t vec_len; @@ -9620,8 +10817,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } /* store original address for later verification */ - imu->ubuf = ubuf; - imu->ubuf_end = ubuf + iov->iov_len; + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; @@ -9629,7 +10826,6 @@ done: if (ret) kvfree(imu); kvfree(pages); - kvfree(vmas); return ret; } @@ -9688,12 +10884,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; @@ -9832,19 +11033,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_destroy_buffers(struct io_ring_ctx *ctx) { + struct io_buffer_list *bl; + unsigned long index; int i; - for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) { - struct list_head *list = &ctx->io_buffers[i]; - - while (!list_empty(list)) { - struct io_buffer_list *bl; + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } - bl = list_first_entry(list, struct io_buffer_list, list); - __io_remove_buffers(ctx, bl, -1U); - list_del(&bl->list); - kfree(bl); - } + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -9864,7 +11065,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, state); - while (state->free_list.next) { + while (!io_req_cache_empty(ctx)) { struct io_wq_work_node *node; struct io_kiocb *req; @@ -9953,7 +11154,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx->dummy_ubuf); - kfree(ctx->io_buffers); + kfree(ctx->io_bl); + xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -9984,7 +11186,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. */ - if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) + if (io_cqring_events(ctx) || + test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -10116,8 +11319,7 @@ static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->timeout_lock); - if (canceled != 0) - io_commit_cqring(ctx); + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); @@ -10137,11 +11339,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); - - /* if we failed setting up the ctx, we might not have any rings */ - io_iopoll_try_reap_events(ctx); + /* failed during ring init, it couldn't have issued any requests */ + if (ctx->rings) { + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); + } INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* @@ -10233,6 +11437,10 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + /* failed during ring init, it couldn't have issued any requests */ + if (!ctx->rings) + return; + while (1) { enum io_wq_cancel cret; bool ret = false; @@ -10524,6 +11732,11 @@ static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, break; } + if (reg.resv) { + ret = -EINVAL; + break; + } + if (reg.offset == -1U) { start = 0; end = IO_RINGFD_REG_MAX; @@ -10570,7 +11783,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, ret = -EFAULT; break; } - if (reg.offset >= IO_RINGFD_REG_MAX) { + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } @@ -10673,6 +11886,19 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) return 0; } +static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +{ + if (flags & IORING_ENTER_EXT_ARG) { + struct io_uring_getevents_arg arg; + + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + } + return 0; +} + static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) @@ -10697,6 +11923,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; + if (arg.pad) + return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); @@ -10708,7 +11936,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, size_t, argsz) { struct io_ring_ctx *ctx; - int submitted = 0; struct fd f; long ret; @@ -10771,39 +11998,64 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ret) goto out; } - submitted = to_submit; + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; - mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - if (submitted != to_submit) + mutex_lock(&ctx->uring_lock); + ret = io_submit_sqes(ctx, to_submit); + if (ret != to_submit) { + mutex_unlock(&ctx->uring_lock); goto out; + } + if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll) + goto iopoll_locked; + mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { - const sigset_t __user *sig; - struct __kernel_timespec __user *ts; - - ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); - if (unlikely(ret)) - goto out; + int ret2; + if (ctx->syscall_iopoll) { + /* + * We disallow the app entering submit/complete with + * polling, but we still need to lock the ring to + * prevent racing with polled issue that got punted to + * a workqueue. + */ + mutex_lock(&ctx->uring_lock); +iopoll_locked: + ret2 = io_validate_ext_arg(flags, argp, argsz); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_iopoll_check(ctx, min_complete); + } + mutex_unlock(&ctx->uring_lock); + } else { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (likely(!ret2)) { + min_complete = min(min_complete, + ctx->cq_entries); + ret2 = io_cqring_wait(ctx, min_complete, sig, + argsz, ts); + } + } - min_complete = min(min_complete, ctx->cq_entries); + if (!ret) { + ret = ret2; - /* - * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user - * space applications don't need to do io completion events - * polling again, they can rely on io_sq_thread to do polling - * work, which can reduce cpu usage and uring_lock contention. - */ - if (ctx->flags & IORING_SETUP_IOPOLL && - !(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_iopoll_check(ctx, min_complete); - } else { - ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); + /* + * EBADR indicates that one or more CQE were dropped. + * Once the user has been informed we can clear the bit + * as they are obviously ok with those drops. + */ + if (unlikely(ret2 == -EBADR)) + clear_bit(IO_CHECK_CQ_DROPPED_BIT, + &ctx->check_cq); } } @@ -10812,7 +12064,7 @@ out: out_fput: if (!(flags & IORING_ENTER_REGISTERED_RING)) fdput(f); - return submitted ? submitted : ret; + return ret; } #ifdef CONFIG_PROC_FS @@ -10859,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int sq_tail = READ_ONCE(r->sq.tail); unsigned int cq_head = READ_ONCE(r->cq.head); unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int cq_shift = 0; unsigned int sq_entries, cq_entries; bool has_lock; + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); unsigned int i; + if (is_cqe32) + cq_shift = 1; + /* * we may get imprecise sqe and cqe info if uring is actively running * since we get cached_sq_head and cached_cq_tail without uring_lock @@ -10895,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", entry & cq_mask, cqe->user_data, cqe->res, cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } } /* @@ -11002,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -11017,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (p->flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_mem_free(ctx->rings); ctx->rings = NULL; @@ -11129,11 +12396,41 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; + + /* + * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user + * space applications don't need to do io completion events + * polling again, they can rely on io_sq_thread to do polling + * work, which can reduce cpu usage and uring_lock contention. + */ + if (ctx->flags & IORING_SETUP_IOPOLL && + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->syscall_iopoll = 1; + ctx->compat = in_compat_syscall(); if (!capable(CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* + * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if + * COOP_TASKRUN is set, then IPIs are never needed by the app. + */ + ret = -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG)) + goto err; + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + ctx->notify_method = TWA_SIGNAL_NO_IPI; + } else { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + goto err; + ctx->notify_method = TWA_SIGNAL; + } + + /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount @@ -11178,7 +12475,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | + IORING_FEAT_LINKED_FILE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -11229,10 +12527,12 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL)) + IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | + IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_SQE128 | IORING_SETUP_CQE32)) return -EINVAL; - return io_uring_create(entries, &p, params); + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -11389,8 +12689,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, __u32 tmp; int err; - if (up->resv) - return -EINVAL; if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; err = io_rsrc_node_switch_start(ctx); @@ -11416,6 +12714,8 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } @@ -11428,7 +12728,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr || up.resv) + if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } @@ -11445,14 +12745,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr || rr.resv || rr.resv2) + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } @@ -11587,6 +12893,85 @@ err: return ret; } +static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + struct page **pages; + int nr_pages; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -11612,6 +12997,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, switch (opcode) { case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: @@ -11621,6 +13009,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: @@ -11715,6 +13106,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; default: ret = -EINVAL; break; @@ -11791,6 +13194,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); + BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); @@ -11799,6 +13203,10 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); + BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); + BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != + offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); @@ -11808,6 +13216,10 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); + BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); + + BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b08f5dc31780..80f9b047aa1b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -56,7 +56,8 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) { + /* Sync dio can't be polled reliably */ + if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { bio_set_polled(bio, dio->iocb); dio->submit.poll_bio = bio; } @@ -265,8 +266,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * cache flushes on IO completion. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && - blk_queue_fua(bdev_get_queue(iomap->bdev))) + (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) use_fua = true; } @@ -654,9 +654,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!dio->submit.poll_bio || - !bio_poll(dio->submit.poll_bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5b9408e3b370..ac7f067b7bdd 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -488,7 +488,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_wait_updates(journal); commit_transaction->t_state = T_SWITCH; - write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -508,6 +507,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) * has reserved. This is consistent with the existing behaviour * that multiple jbd2_journal_get_write_access() calls to the same * buffer are perfectly permissible. + * We use journal->j_state_lock here to serialize processing of + * t_reserved_list with eviction of buffers from journal_unmap_buffer(). */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -527,6 +528,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_refile_buffer(journal, jh); } + write_unlock(&journal->j_state_lock); /* * Now try to drop any written-back buffers from the journal's * checkpoint lists. We do this *before* commit because it potentially diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fcacafa4510d..c0cbeeaec2d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1762,7 +1762,6 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) unsigned long block, log_offset; /* logical */ unsigned long long phys_block, block_start, block_stop; /* physical */ loff_t byte_start, byte_stop, byte_count; - struct request_queue *q = bdev_get_queue(journal->j_dev); /* flags must be set to either discard or zeroout */ if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || @@ -1770,10 +1769,8 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; - if (!q) - return -ENXIO; - - if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + !bdev_max_discard_sectors(journal->j_dev)) return -EOPNOTSUPP; /* @@ -1828,7 +1825,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) err = blkdev_issue_discard(journal->j_dev, byte_start >> SECTOR_SHIFT, byte_count >> SECTOR_SHIFT, - GFP_NOFS, 0); + GFP_NOFS); } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { err = blkdev_issue_zeroout(journal->j_dev, byte_start >> SECTOR_SHIFT, diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 03a845ab4f00..1e7b177ece60 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -110,14 +110,13 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; s64 ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { jfs_warn("FITRIM not supported on device"); return -EOPNOTSUPP; } @@ -127,7 +126,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; range.minlen = max_t(unsigned int, range.minlen, - q->limits.discard_granularity); + bdev_discard_granularity(sb->s_bdev)); ret = jfs_ioc_trim(inode, &range); if (ret < 0) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index f1a13a74cddf..85d4f44f2ac4 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -372,19 +372,16 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_discard: - { - struct request_queue *q = bdev_get_queue(sb->s_bdev); /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ sbi->minblks_trim = 64; - if (blk_queue_discard(q)) + if (bdev_max_discard_sectors(sb->s_bdev)) *flag |= JFS_DISCARD; else pr_err("JFS: discard option not supported on device\n"); break; - } case Opt_nodiscard: *flag &= ~JFS_DISCARD; @@ -392,10 +389,9 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, case Opt_discard_minblk: { - struct request_queue *q = bdev_get_queue(sb->s_bdev); char *minblks_trim = args[0].from; int rc; - if (blk_queue_discard(q)) { + if (bdev_max_discard_sectors(sb->s_bdev)) { *flag |= JFS_DISCARD; rc = kstrtouint(minblks_trim, 0, &sbi->minblks_trim); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 61a8edc4ba8b..e205fde7163a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1406,7 +1406,12 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - struct kernfs_root *root = kernfs_root(kn); + struct kernfs_root *root; + + if (!kn) + return; + + root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 60e7ac62c917..1e2076a53bed 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -158,19 +158,41 @@ out: * Return : windows path string or error */ -char *convert_to_nt_pathname(char *filename) +char *convert_to_nt_pathname(struct ksmbd_share_config *share, + struct path *path) { - char *ab_pathname; + char *pathname, *ab_pathname, *nt_pathname; + int share_path_len = share->path_sz; - if (strlen(filename) == 0) - filename = "\\"; + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathname) + return ERR_PTR(-EACCES); - ab_pathname = kstrdup(filename, GFP_KERNEL); - if (!ab_pathname) - return NULL; + ab_pathname = d_path(path, pathname, PATH_MAX); + if (IS_ERR(ab_pathname)) { + nt_pathname = ERR_PTR(-EACCES); + goto free_pathname; + } + + if (strncmp(ab_pathname, share->path, share_path_len)) { + nt_pathname = ERR_PTR(-EACCES); + goto free_pathname; + } + + nt_pathname = kzalloc(strlen(&ab_pathname[share_path_len]) + 2, GFP_KERNEL); + if (!nt_pathname) { + nt_pathname = ERR_PTR(-ENOMEM); + goto free_pathname; + } + if (ab_pathname[share_path_len] == '\0') + strcpy(nt_pathname, "/"); + strcat(nt_pathname, &ab_pathname[share_path_len]); + + ksmbd_conv_path_to_windows(nt_pathname); - ksmbd_conv_path_to_windows(ab_pathname); - return ab_pathname; +free_pathname: + kfree(pathname); + return nt_pathname; } int get_nlink(struct kstat *st) diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index 253366bd0951..aae2a252945f 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -14,7 +14,8 @@ struct ksmbd_file; int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); -char *convert_to_nt_pathname(char *filename); +char *convert_to_nt_pathname(struct ksmbd_share_config *share, + struct path *path); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 23871b18a429..8b5560574d4c 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1694,33 +1694,3 @@ out: read_unlock(&lease_list_lock); return ret_op; } - -int smb2_check_durable_oplock(struct ksmbd_file *fp, - struct lease_ctx_info *lctx, char *name) -{ - struct oplock_info *opinfo = opinfo_get(fp); - int ret = 0; - - if (opinfo && opinfo->is_lease) { - if (!lctx) { - pr_err("open does not include lease\n"); - ret = -EBADF; - goto out; - } - if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key, - SMB2_LEASE_KEY_SIZE)) { - pr_err("invalid lease key\n"); - ret = -EBADF; - goto out; - } - if (name && strcmp(fp->filename, name)) { - pr_err("invalid name reconnect %s\n", name); - ret = -EINVAL; - goto out; - } - } -out: - if (opinfo) - opinfo_put(opinfo); - return ret; -} diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h index 0cf7a2b5bbc0..09753448f779 100644 --- a/fs/ksmbd/oplock.h +++ b/fs/ksmbd/oplock.h @@ -124,6 +124,4 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn, int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci, struct lease_ctx_info *lctx); void destroy_lease_table(struct ksmbd_conn *conn); -int smb2_check_durable_oplock(struct ksmbd_file *fp, - struct lease_ctx_info *lctx, char *name); #endif /* __KSMBD_OPLOCK_H */ diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 3bf6c56c654c..16c803a9d996 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -11,6 +11,7 @@ #include <linux/statfs.h> #include <linux/ethtool.h> #include <linux/falloc.h> +#include <linux/mount.h> #include "glob.h" #include "smbfsctl.h" @@ -2918,7 +2919,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - fp->filename = name; fp->cdoption = req->CreateDisposition; fp->daccess = daccess; fp->saccess = req->ShareAccess; @@ -3270,14 +3270,13 @@ err_out1: if (!rsp->hdr.Status) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; - if (!fp || !fp->filename) - kfree(name); if (fp) ksmbd_fd_put(work, fp); smb2_set_err_rsp(work); ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status); } + kfree(name); kfree(lc); return 0; @@ -3895,8 +3894,6 @@ int smb2_query_dir(struct ksmbd_work *work) ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr); } - ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename); - if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { ksmbd_debug(SMB, "Restart directory scan\n"); generic_file_llseek(dir_fp->filp, 0, SEEK_SET); @@ -4390,9 +4387,9 @@ static int get_file_all_info(struct ksmbd_work *work, return -EACCES; } - filename = convert_to_nt_pathname(fp->filename); - if (!filename) - return -ENOMEM; + filename = convert_to_nt_pathname(work->tcon->share_conf, &fp->filp->f_path); + if (IS_ERR(filename)) + return PTR_ERR(filename); inode = file_inode(fp->filp); generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); @@ -4999,15 +4996,17 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, case FS_SECTOR_SIZE_INFORMATION: { struct smb3_fs_ss_info *info; + unsigned int sector_size = + min_t(unsigned int, path.mnt->mnt_sb->s_blocksize, 4096); info = (struct smb3_fs_ss_info *)(rsp->Buffer); - info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize); + info->LogicalBytesPerSector = cpu_to_le32(sector_size); info->PhysicalBytesPerSectorForAtomicity = - cpu_to_le32(stfs.f_bsize); - info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize); + cpu_to_le32(sector_size); + info->PhysicalBytesPerSectorForPerf = cpu_to_le32(sector_size); info->FSEffPhysicalBytesPerSectorForAtomicity = - cpu_to_le32(stfs.f_bsize); + cpu_to_le32(sector_size); info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE | SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE); info->ByteOffsetForSectorAlignment = 0; @@ -5683,8 +5682,7 @@ static int set_file_allocation_info(struct ksmbd_work *work, size = i_size_read(inode); rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512); if (rc) { - pr_err("truncate failed! filename : %s, err %d\n", - fp->filename, rc); + pr_err("truncate failed!, err %d\n", rc); return rc; } if (size < alloc_blks * 512) @@ -5714,12 +5712,10 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, * truncated range. */ if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { - ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n", - fp->filename, newsize); + ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize); rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { - ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n", - fp->filename, rc); + ksmbd_debug(SMB, "truncate failed!, err %d\n", rc); if (rc != -EAGAIN) rc = -EBADF; return rc; @@ -5765,8 +5761,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (parent_fp) { if (parent_fp->daccess & FILE_DELETE_LE) { pr_err("parent dir is opened with delete access\n"); + ksmbd_fd_put(work, parent_fp); return -ESHARE; } + ksmbd_fd_put(work, parent_fp); } next: return smb2_rename(work, fp, user_ns, rename_info, diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 9cebb6ba555b..dcdd07c6efff 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -398,8 +398,7 @@ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, nbytes = kernel_read(filp, rbuf, count, pos); if (nbytes < 0) { - pr_err("smb read failed for (%s), err = %zd\n", - fp->filename, nbytes); + pr_err("smb read failed, err = %zd\n", nbytes); return nbytes; } @@ -875,8 +874,7 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work, err = vfs_truncate(&filp->f_path, size); if (err) - pr_err("truncate failed for filename : %s err %d\n", - fp->filename, err); + pr_err("truncate failed, err %d\n", err); return err; } diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 29c1db66bd0f..c4d59d2735f0 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -328,7 +328,6 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) kfree(smb_lock); } - kfree(fp->filename); if (ksmbd_stream_fd(fp)) kfree(fp->stream.name); kmem_cache_free(filp_cache, fp); @@ -497,6 +496,7 @@ struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode) list_for_each_entry(lfp, &ci->m_fp_list, node) { if (inode == file_inode(lfp->filp)) { atomic_dec(&ci->m_count); + lfp = ksmbd_fp_get(lfp); read_unlock(&ci->m_lock); return lfp; } diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 36239ce31afd..fcb13413fa8d 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -62,7 +62,6 @@ struct ksmbd_inode { struct ksmbd_file { struct file *filp; - char *filename; u64 persistent_id; u64 volatile_id; diff --git a/fs/namei.c b/fs/namei.c index 3f1829b3ab5b..509657fdf4f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3673,18 +3673,14 @@ static struct dentry *filename_create(int dfd, struct filename *name, { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; + bool want_dir = lookup_flags & LOOKUP_DIRECTORY; + unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; + unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; - bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); - /* - * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any - * other flags passed in are ignored! - */ - lookup_flags &= LOOKUP_REVAL; - - error = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); @@ -3698,11 +3694,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* - * Do the final lookup. + * Do the final lookup. Suppress 'create' if there is a trailing + * '/', and a directory wasn't requested. */ - lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + if (last.name[last.len] && !want_dir) + create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, lookup_flags); + dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -3716,7 +3714,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && last.name[last.len])) { + if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } diff --git a/fs/namespace.c b/fs/namespace.c index a0a36bfa3aa0..afe2b64b14f1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4058,10 +4058,22 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (err) { struct mount *p; - for (p = mnt; p != m; p = next_mnt(p, mnt)) { + /* + * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will + * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all + * mounts and needs to take care to include the first mount. + */ + for (p = mnt; p; p = next_mnt(p, mnt)) { /* If we had to hold writers unblock them. */ if (p->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(p); + + /* + * We're done once the first mount we changed got + * MNT_WRITE_HOLD unset. + */ + if (p == m) + break; } } return err; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e2d59bb5e6bb..9a16897e8dc6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else - ctx->flags &= NFS_MOUNT_SOFTREVAL; + ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 16106f805ffa..a79f66432bd3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,14 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version) +{ + if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) { + fattr->pre_change_attr = version; + fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; + } +} + static void nfs4_test_and_free_stateid(struct nfs_server *server, nfs4_stateid *stateid, const struct cred *cred) @@ -6553,7 +6561,9 @@ static void nfs4_delegreturn_release(void *calldata) pnfs_roc_release(&data->lr.arg, &data->lr.res, data->res.lr_ret); if (inode) { - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); + nfs4_fattr_set_prechange(&data->fattr, + inode_peek_iversion_raw(inode)); + nfs_refresh_inode(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c08882f5867b..2c1b027774d4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -237,6 +237,13 @@ nfsd_file_check_write_error(struct nfsd_file *nf) } static void +nfsd_file_flush(struct nfsd_file *nf) +{ + if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); +} + +static void nfsd_file_do_unhash(struct nfsd_file *nf) { lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); @@ -295,19 +302,15 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { - bool is_hashed; - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) { + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { + nfsd_file_flush(nf); nfsd_file_put_noref(nf); - return; + } else { + nfsd_file_put_noref(nf); + if (nf->nf_file) + nfsd_file_schedule_laundrette(); } - - filemap_flush(nf->nf_file->f_mapping); - is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; - nfsd_file_put_noref(nf); - if (is_hashed) - nfsd_file_schedule_laundrette(); if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) nfsd_file_gc(); } @@ -328,6 +331,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); nfsd_file_put_noref(nf); } } @@ -341,6 +345,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); list_del(&nf->nf_lru); + nfsd_file_flush(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; if (nfsd_file_free(nf)) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 367551bddfc6..b5760801d377 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -249,34 +249,34 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr) int w; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; if (dentry == NULL || d_really_is_negative(dentry)) - return 1; + return true; inode = d_inode(dentry); if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->mask) < 0) - return 0; + return false; rqstp->rq_res.page_len = w = nfsacl_size( (resp->mask & NFS_ACL) ? resp->acl_access : NULL, (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); while (w > 0) { if (!*(rqstp->rq_next_page++)) - return 1; + return true; w -= PAGE_SIZE; } if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access, resp->mask & NFS_ACL, 0)) - return 0; + return false; if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default, resp->mask & NFS_DFACL, NFS_ACL_DEFAULT)) - return 0; + return false; - return 1; + return true; } /* ACCESS */ @@ -286,17 +286,17 @@ nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr) struct nfsd3_accessres *resp = rqstp->rq_resp; if (!svcxdr_encode_stat(xdr, resp->status)) - return 0; + return false; switch (resp->status) { case nfs_ok: if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat)) - return 0; + return false; if (xdr_stream_encode_u32(xdr, resp->access) < 0) - return 0; + return false; break; } - return 1; + return true; } /* diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fec194a666f4..87e1004b606d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1052,20 +1052,20 @@ out: static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; - struct request_queue *q = bdev_get_queue(nilfs->ns_bdev); struct fstrim_range range; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(nilfs->ns_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u64, range.minlen, + bdev_discard_granularity(nilfs->ns_bdev)); down_read(&nilfs->ns_segctor_sem); ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range); diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index e385cca2004a..77ff8e95421f 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1100,7 +1100,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) { put_bh(su_bh); goto out_sem; @@ -1134,7 +1134,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range) ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) ndiscarded += nblocks; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index dd48a8f74d57..3b4a079c9617 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -672,7 +672,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); if (ret < 0) return ret; nblocks = 0; @@ -682,7 +682,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, 0); + GFP_NOFS); return ret; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9b32b76a9c30..a792e21c5309 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1657,6 +1657,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + /* + * FAN_RENAME is not allowed on non-dir (for now). + * We shouldn't have allowed setting any dirent events in mask of + * non-dir, but because we always allowed it, error only if group + * was initialized with the new flag FAN_REPORT_TARGET_FID. + */ + ret = -ENOTDIR; + if (inode && !S_ISDIR(inode->i_mode) && + ((mask & FAN_RENAME) || + ((mask & FANOTIFY_DIRENT_EVENTS) && + FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 787b53b984ee..15806eeae217 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -22,20 +22,20 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) { struct fstrim_range __user *user_range; struct fstrim_range range; - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity); + range.minlen = max_t(u32, range.minlen, + bdev_discard_granularity(sbi->sb->s_bdev)); err = ntfs_trim_fs(sbi, &range); if (err < 0) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 278dcf502410..5781b9e8e3d8 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -882,7 +882,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct request_queue *rq; struct inode *inode; struct ntfs_inode *ni; size_t i, tt; @@ -912,15 +911,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto out; } - rq = bdev_get_queue(bdev); - if (blk_queue_discard(rq) && rq->limits.discard_granularity) { - sbi->discard_granularity = rq->limits.discard_granularity; + if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { + sbi->discard_granularity = bdev_discard_granularity(bdev); sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } /* Parse boot. */ - err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, + err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), bdev_nr_bytes(bdev)); if (err) goto out; @@ -1335,7 +1333,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return 0; err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (err == -EOPNOTSUPP) sbi->flags |= NTFS_FLAGS_NODISCARD; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index f59461d85da4..afd54ec66103 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -903,20 +903,19 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; - struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; - range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen = max_t(u64, bdev_discard_granularity(sb->s_bdev), range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) diff --git a/fs/pipe.c b/fs/pipe.c index 9648ac15164a..e140ea150bbb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -804,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -849,7 +849,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); - kvfree(pipe->bufs); + kfree(pipe->bufs); kfree(pipe); } @@ -1264,7 +1264,8 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); + bufs = kcalloc(nr_slots, sizeof(*bufs), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; @@ -1291,7 +1292,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n; tail = 0; - kvfree(pipe->bufs); + kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 80acb6885cf9..962d32468eb4 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -759,9 +759,14 @@ static void posix_acl_fix_xattr_userns( } void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, @@ -769,9 +774,14 @@ void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, } void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, + struct inode *inode, void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); + + /* Leave ids untouched on non-idmapped mounts. */ + if (no_idmapping(mnt_userns, i_user_ns(inode))) + mnt_userns = &init_user_ns; if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 419760fd77bd..f38bda5b83ec 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -5,14 +5,10 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -__weak void arch_freq_prepare_all(void) -{ -} - extern const struct seq_operations cpuinfo_op; + static int cpuinfo_open(struct inode *inode, struct file *file) { - arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b31..913bef0d2a36 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ out: return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 622c844f6d11..8879d052f96c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -86,17 +86,10 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_VECS) { - bio = bio_alloc(sb->s_bdev, page_count, REQ_OP_READ, GFP_NOIO); - } else { - bio = bio_kmalloc(GFP_NOIO, page_count); - bio_set_dev(bio, sb->s_bdev); - bio->bi_opf = REQ_OP_READ; - } - + bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - + bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { @@ -126,7 +119,8 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return error; } @@ -190,7 +184,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, length |= data[0] << 8; } bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); @@ -224,7 +219,8 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, out_free_bio: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out: if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); diff --git a/fs/stat.c b/fs/stat.c index 7f734be0e57e..5c2c94464e8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -348,9 +348,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev(x),true) -#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif @@ -359,7 +356,9 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; - if (!valid_dev(stat->dev) || !valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -367,7 +366,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) #endif INIT_STRUCT_STAT_PADDING(tmp); - tmp.st_dev = encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -377,7 +376,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; tmp.st_mtime = stat->mtime.tv_sec; @@ -665,11 +664,13 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) + return -EOVERFLOW; + if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)) return -EOVERFLOW; memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_dev = new_encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; @@ -679,7 +680,7 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) return -EOVERFLOW; SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); + tmp.st_rdev = new_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; tmp.st_size = stat->size; diff --git a/fs/super.c b/fs/super.c index f1d4a193602d..60f57c7bc0a6 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1204,7 +1204,7 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); - if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) + if (bdev_stable_writes(s->s_bdev)) s->s_iflags |= SB_I_STABLE_WRITES; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 008fa46ef61e..7d6d2f152e03 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -132,7 +132,7 @@ #define WORST_COMPR_FACTOR 2 #ifdef CONFIG_FS_ENCRYPTION -#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT #else #define UBIFS_CIPHER_BLOCK_SIZE 0 #endif diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 0ed4861b038f..b3d5f97f16cd 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -75,11 +75,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy(udf_get_fi_ident(sfi), fileident, lfi); + memcpy(sfi->impUse + liu, fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy(udf_get_fi_ident(sfi), fileident, -offset); + memcpy(sfi->impUse + liu, fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +88,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); + memset(sfi->impUse + liu + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); + memset(sfi->impUse + liu + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 60a4372aa4d7..d52872c808ff 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -202,7 +202,7 @@ static int enable_verity(struct file *filp, const struct fsverity_operations *vops = inode->i_sb->s_vop; struct merkle_tree_params params = { }; struct fsverity_descriptor *desc; - size_t desc_size = sizeof(*desc) + arg->sig_size; + size_t desc_size = struct_size(desc, signature, arg->sig_size); struct fsverity_info *vi; int err; @@ -281,7 +281,7 @@ static int enable_verity(struct file *filp, * from disk. This is simpler, and it serves as an extra check that the * metadata we're writing is valid before actually enabling verity. */ - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto rollback; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a7920434bae5..caf9d248ec2d 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -122,16 +122,14 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size); + struct fsverity_descriptor *desc); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret); + struct fsverity_descriptor **desc_ret); int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); diff --git a/fs/verity/open.c b/fs/verity/open.c index 92df87f5fa38..81ff94442f7b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -147,8 +147,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - struct fsverity_descriptor *desc, - size_t desc_size) + struct fsverity_descriptor *desc) { struct fsverity_info *vi; int err; @@ -264,8 +263,7 @@ static bool validate_fsverity_descriptor(struct inode *inode, * the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, - struct fsverity_descriptor **desc_ret, - size_t *desc_size_ret) + struct fsverity_descriptor **desc_ret) { int res; struct fsverity_descriptor *desc; @@ -297,7 +295,6 @@ int fsverity_get_descriptor(struct inode *inode, } *desc_ret = desc; - *desc_size_ret = res; return 0; } @@ -306,17 +303,16 @@ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); struct fsverity_descriptor *desc; - size_t desc_size; int err; if (vi) return 0; - err = fsverity_get_descriptor(inode, &desc, &desc_size); + err = fsverity_get_descriptor(inode, &desc); if (err) return err; - vi = fsverity_create_info(inode, desc, desc_size); + vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto out_free_desc; diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 7e2d0c7bdf0d..6ee849dc7bc1 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -101,7 +101,7 @@ static int fsverity_read_descriptor(struct inode *inode, size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; @@ -119,10 +119,9 @@ static int fsverity_read_signature(struct inode *inode, void __user *buf, u64 offset, int length) { struct fsverity_descriptor *desc; - size_t desc_size; int res; - res = fsverity_get_descriptor(inode, &desc, &desc_size); + res = fsverity_get_descriptor(inode, &desc); if (res) return res; diff --git a/fs/xattr.c b/fs/xattr.c index 5c8c5175b385..e8dd03e4561e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -25,6 +25,8 @@ #include <linux/uaccess.h> +#include "internal.h" + static const char * strcmp_prefix(const char *a, const char *a_prefix) { @@ -539,43 +541,76 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ -static long -setxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, const void __user *value, size_t size, - int flags) + +int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) { int error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) + if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; + error = strncpy_from_user(ctx->kname->name, name, + sizeof(ctx->kname->name)); + if (error == 0 || error == sizeof(ctx->kname->name)) + return -ERANGE; if (error < 0) return error; - if (size) { - if (size > XATTR_SIZE_MAX) + error = 0; + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kvmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - error = -EFAULT; - goto out; + + ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); + if (IS_ERR(ctx->kvalue)) { + error = PTR_ERR(ctx->kvalue); + ctx->kvalue = NULL; } - if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || - (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size); } - error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); -out: - kvfree(kvalue); + return error; +} + +static void setxattr_convert(struct user_namespace *mnt_userns, + struct dentry *d, struct xattr_ctx *ctx) +{ + if (ctx->size && + ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) + posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), + ctx->kvalue, ctx->size); +} + +int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct xattr_ctx *ctx) +{ + setxattr_convert(mnt_userns, dentry, ctx); + return vfs_setxattr(mnt_userns, dentry, ctx->kname->name, + ctx->kvalue, ctx->size, ctx->flags); +} + +static long +setxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, const void __user *value, size_t size, + int flags) +{ + struct xattr_name kname; + struct xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + int error; + error = setxattr_copy(name, &ctx); + if (error) + return error; + + error = do_setxattr(mnt_userns, d, &ctx); + + kvfree(ctx.kvalue); return error; } @@ -641,43 +676,61 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, /* * Extended attribute GET operations */ -static ssize_t -getxattr(struct user_namespace *mnt_userns, struct dentry *d, - const char __user *name, void __user *value, size_t size) +ssize_t +do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, + struct xattr_ctx *ctx) { ssize_t error; - void *kvalue = NULL; - char kname[XATTR_NAME_MAX + 1]; - - error = strncpy_from_user(kname, name, sizeof(kname)); - if (error == 0 || error == sizeof(kname)) - error = -ERANGE; - if (error < 0) - return error; + char *kname = ctx->kname->name; - if (size) { - if (size > XATTR_SIZE_MAX) - size = XATTR_SIZE_MAX; - kvalue = kvzalloc(size, GFP_KERNEL); - if (!kvalue) + if (ctx->size) { + if (ctx->size > XATTR_SIZE_MAX) + ctx->size = XATTR_SIZE_MAX; + ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL); + if (!ctx->kvalue) return -ENOMEM; } - error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error); - if (size && copy_to_user(value, kvalue, error)) + posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), + ctx->kvalue, error); + if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; - } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { + } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - kvfree(kvalue); + return error; +} + +static ssize_t +getxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, void __user *value, size_t size) +{ + ssize_t error; + struct xattr_name kname; + struct xattr_ctx ctx = { + .value = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = 0, + }; + + error = strncpy_from_user(kname.name, name, sizeof(kname.name)); + if (error == 0 || error == sizeof(kname.name)) + error = -ERANGE; + if (error < 0) + return error; + + error = do_getxattr(mnt_userns, d, &ctx); + kvfree(ctx.kvalue); return error; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e1afb9e503e1..bf4e60871068 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -406,7 +406,7 @@ xfs_buf_alloc_pages( STATIC int _xfs_buf_map_pages( struct xfs_buf *bp, - uint flags) + xfs_buf_flags_t flags) { ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { @@ -868,7 +868,7 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -903,7 +903,7 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - int flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edcb6254fa6a..1ee3056ff9cf 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,28 +22,28 @@ struct xfs_buf; #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -#define XBF_READ (1 << 0) /* buffer intended for reading from device */ -#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ -#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ -#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ -#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ -#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ -#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ +#define XBF_READ (1u << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ +#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ -#define _XBF_INODES (1 << 16)/* inode buffer */ -#define _XBF_DQUOTS (1 << 17)/* dquot buffer */ -#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */ +#define _XBF_INODES (1u << 16)/* inode buffer */ +#define _XBF_DQUOTS (1u << 17)/* dquot buffer */ +#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ -#define _XBF_KMEM (1 << 21)/* backed by heap memory */ -#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1u << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ -#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */ +#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -58,7 +58,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_INODES, "INODES" }, \ { _XBF_DQUOTS, "DQUOTS" }, \ - { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ + { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ @@ -247,11 +247,11 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags, - struct xfs_buf **bpp); +int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, int flags, struct xfs_buf **bpp, - const struct xfs_buf_ops *ops); + size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); void xfs_buf_hold(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0191de8ce9ce..c6fe3f6ebb6b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -114,7 +114,7 @@ xfs_trim_extents( } trace_xfs_discard_extent(mp, agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); if (error) goto out_del_cursor; *blocks_trimmed += flen; @@ -152,8 +152,8 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); - unsigned int granularity = q->limits.discard_granularity; + unsigned int granularity = + bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); struct fstrim_range range; xfs_daddr_t start, end, minlen; xfs_agnumber_t start_agno, end_agno, agno; @@ -162,7 +162,7 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) return -EOPNOTSUPP; /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9de6205fe134..39ae53efb3ab 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2594,14 +2594,13 @@ xfs_ifree_cluster( } /* - * This is called to return an inode to the inode free list. - * The inode should already be truncated to 0 length and have - * no pages associated with it. This routine also assumes that - * the inode is already a part of the transaction. + * This is called to return an inode to the inode free list. The inode should + * already be truncated to 0 length and have no pages associated with it. This + * routine also assumes that the inode is already a part of the transaction. * - * The on-disk copy of the inode will have been added to the list - * of unlinked inodes in the AGI. We need to remove the inode from - * that list atomically with respect to freeing it here. + * The on-disk copy of the inode will have been added to the list of unlinked + * inodes in the AGI. We need to remove the inode from that list atomically with + * respect to freeing it here. */ int xfs_ifree( @@ -2623,13 +2622,16 @@ xfs_ifree( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* - * Pull the on-disk inode from the AGI unlinked list. + * Free the inode first so that we guarantee that the AGI lock is going + * to be taken before we remove the inode from the unlinked list. This + * makes the AGI lock -> unlinked list modification order the same as + * used in O_TMPFILE creation. */ - error = xfs_iunlink_remove(tp, pag, ip); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - goto out; + return error; - error = xfs_difree(tp, pag, ip->i_ino, &xic); + error = xfs_iunlink_remove(tp, pag, ip); if (error) goto out; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ba57323bfdce..c9f55e4f0957 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -605,7 +605,7 @@ xlog_discard_busy_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, 0, &bio); + GFP_NOFS, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 54be9d64093e..a276b8111f63 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1608,14 +1608,10 @@ xfs_fs_fill_super( goto out_filestream_unmount; } - if (xfs_has_discard(mp)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - - if (!blk_queue_discard(q)) { - xfs_warn(mp, "mounting with \"discard\" option, but " - "the device does not support discard"); - mp->m_features &= ~XFS_FEAT_DISCARD; - } + if (xfs_has_discard(mp) && !bdev_max_discard_sectors(sb->s_bdev)) { + xfs_warn(mp, + "mounting with \"discard\" option, but the device does not support discard"); + mp->m_features &= ~XFS_FEAT_DISCARD; } if (xfs_has_reflink(mp)) { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index de177842b951..0c82673238f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -175,7 +175,7 @@ xfs_trans_get_buf( struct xfs_buftarg *target, xfs_daddr_t blkno, int numblks, - uint flags, + xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 33c1a4f1132e..9fe54f5319f2 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o +zonefs-y := super.o sysfs.o diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3614c7834007..b3b0b71fdf6c 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -27,6 +27,39 @@ #define CREATE_TRACE_POINTS #include "trace.h" +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +static void zonefs_account_active(struct inode *inode) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + lockdep_assert_held(&zi->i_truncate_mutex); + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return; + + /* + * If the zone is active, that is, if it is explicitly open or + * partially written, check if it was already accounted as active. + */ + if ((zi->i_flags & ZONEFS_ZONE_OPEN) || + (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { + if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { + zi->i_flags |= ZONEFS_ZONE_ACTIVE; + atomic_inc(&sbi->s_active_seq_files); + } + return; + } + + /* The zone is not active. If it was, update the active count */ + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { + zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + atomic_dec(&sbi->s_active_seq_files); + } +} + static inline int zonefs_zone_mgmt(struct inode *inode, enum req_opf op) { @@ -35,6 +68,17 @@ static inline int zonefs_zone_mgmt(struct inode *inode, lockdep_assert_held(&zi->i_truncate_mutex); + /* + * With ZNS drives, closing an explicitly open zone that has not been + * written will change the zone state to "closed", that is, the zone + * will remain active. Since this can then cause failure of explicit + * open operation on other zones if the drive active zone resources + * are exceeded, make sure that the zone does not remain active by + * resetting it. + */ + if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + op = REQ_OP_ZONE_RESET; + trace_zonefs_zone_mgmt(inode, op); ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); @@ -57,8 +101,13 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) * A full zone is no longer open/active and does not need * explicit closing. */ - if (isize >= zi->i_max_size) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + if (isize >= zi->i_max_size) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (zi->i_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); + zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + } } static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -386,6 +435,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; + zonefs_account_active(inode); return 0; } @@ -497,6 +547,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); zi->i_wpoffset = isize; + zonefs_account_active(inode); unlock: mutex_unlock(&zi->i_truncate_mutex); @@ -678,13 +729,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max; + unsigned int max = bdev_max_zone_append_sectors(bdev); struct bio *bio; ssize_t size; int nr_pages; ssize_t ret; - max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); @@ -855,8 +905,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ mutex_lock(&zi->i_truncate_mutex); zi->i_wpoffset += count; + zonefs_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -998,13 +1055,13 @@ inode_unlock: return ret; } -static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +/* + * Write open accounting is done only for sequential files. + */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - - if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) - return false; if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) return false; @@ -1015,28 +1072,34 @@ static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *fi return true; } -static int zonefs_open_zone(struct inode *inode) +static int zonefs_seq_file_write_open(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); if (!zi->i_wr_refcnt) { - if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { - atomic_dec(&sbi->s_open_zones); - ret = -EBUSY; - goto unlock; - } + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_open_zones); + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + + if (wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; goto unlock; } - zi->i_flags |= ZONEFS_ZONE_OPEN; + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); + } } } @@ -1056,30 +1119,31 @@ static int zonefs_file_open(struct inode *inode, struct file *file) if (ret) return ret; - if (zonefs_file_use_exp_open(inode, file)) - return zonefs_open_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); return 0; } -static void zonefs_close_zone(struct inode *inode) +static void zonefs_seq_file_write_close(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret = 0; mutex_lock(&zi->i_truncate_mutex); - zi->i_wr_refcnt--; - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct super_block *sb = inode->i_sb; - /* - * If the file zone is full, it is not open anymore and we only - * need to decrement the open count. - */ - if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) - goto dec; + zi->i_wr_refcnt--; + if (zi->i_wr_refcnt) + goto unlock; + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); if (ret) { __zonefs_io_error(inode, false); @@ -1091,14 +1155,23 @@ static void zonefs_close_zone(struct inode *inode) */ if (zi->i_flags & ZONEFS_ZONE_OPEN && !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + zi->i_zsector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } + goto unlock; } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; -dec: - atomic_dec(&sbi->s_open_zones); + zonefs_account_active(inode); } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: mutex_unlock(&zi->i_truncate_mutex); } @@ -1110,8 +1183,8 @@ static int zonefs_file_release(struct inode *inode, struct file *file) * the zone has gone offline or read-only). Make sure we don't fail the * close(2) for user-space. */ - if (zonefs_file_use_exp_open(inode, file)) - zonefs_close_zone(inode); + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); return 0; } @@ -1142,6 +1215,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; + zi->i_flags = 0; return &zi->i_vnode; } @@ -1293,12 +1367,13 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, inc_nlink(parent); } -static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; @@ -1323,6 +1398,29 @@ static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + + mutex_lock(&zi->i_truncate_mutex); + + /* + * For sequential zones, make sure that any open zone is closed first + * to ensure that the initial number of open zones is 0, in sync with + * the open zone accounting done when the mount option + * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (type == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) + goto unlock; + } + + zonefs_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return 0; } static struct dentry *zonefs_create_inode(struct dentry *parent, @@ -1332,6 +1430,7 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, struct inode *dir = d_inode(parent); struct dentry *dentry; struct inode *inode; + int ret; dentry = d_alloc_name(parent, name); if (!dentry) @@ -1342,10 +1441,16 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, goto dput; inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) - zonefs_init_file_inode(inode, zone, type); - else + if (zone) { + ret = zonefs_init_file_inode(inode, zone, type); + if (ret) { + iput(inode); + goto dput; + } + } else { zonefs_init_dir_inode(dir, inode, type); + } + d_add(dentry, inode); dir->i_size++; @@ -1652,14 +1757,18 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; - sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); - atomic_set(&sbi->s_open_zones, 0); - if (!sbi->s_max_open_zones && + + atomic_set(&sbi->s_wro_seq_files, 0); + sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); + if (!sbi->s_max_wro_seq_files && sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n"); sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } + atomic_set(&sbi->s_active_seq_files, 0); + sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); + ret = zonefs_read_super(sb); if (ret) return ret; @@ -1674,6 +1783,10 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) goto cleanup; + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + zonefs_info(sb, "Mounting %u zones", blkdev_nr_zones(sb->s_bdev->bd_disk)); @@ -1719,6 +1832,8 @@ static void zonefs_kill_super(struct super_block *sb) if (sb->s_root) d_genocide(sb->s_root); + + zonefs_sysfs_unregister(sb); kill_block_super(sb); kfree(sbi); } @@ -1766,16 +1881,26 @@ static int __init zonefs_init(void) return ret; ret = register_filesystem(&zonefs_type); - if (ret) { - zonefs_destroy_inodecache(); - return ret; - } + if (ret) + goto destroy_inodecache; + + ret = zonefs_sysfs_init(); + if (ret) + goto unregister_fs; return 0; + +unregister_fs: + unregister_filesystem(&zonefs_type); +destroy_inodecache: + zonefs_destroy_inodecache(); + + return ret; } static void __exit zonefs_exit(void) { + zonefs_sysfs_exit(); zonefs_destroy_inodecache(); unregister_filesystem(&zonefs_type); } diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c new file mode 100644 index 000000000000..9cb6755ce39a --- /dev/null +++ b/fs/zonefs/sysfs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> + +#include "zonefs.h" + +struct zonefs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct zonefs_sb_info *sbi, char *buf); +}; + +static inline struct zonefs_sysfs_attr *to_attr(struct attribute *attr) +{ + return container_of(attr, struct zonefs_sysfs_attr, attr); +} + +#define ZONEFS_SYSFS_ATTR_RO(name) \ +static struct zonefs_sysfs_attr zonefs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &zonefs_sysfs_attr_##name.attr + +static ssize_t zonefs_sysfs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + struct zonefs_sysfs_attr *zonefs_attr = + container_of(attr, struct zonefs_sysfs_attr, attr); + + if (!zonefs_attr->show) + return 0; + + return zonefs_attr->show(sbi, buf); +} + +static ssize_t max_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_wro_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_wro_seq_files); + +static ssize_t nr_wro_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_wro_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_wro_seq_files); + +static ssize_t max_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%u\n", sbi->s_max_active_seq_files); +} +ZONEFS_SYSFS_ATTR_RO(max_active_seq_files); + +static ssize_t nr_active_seq_files_show(struct zonefs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%d\n", atomic_read(&sbi->s_active_seq_files)); +} +ZONEFS_SYSFS_ATTR_RO(nr_active_seq_files); + +static struct attribute *zonefs_sysfs_attrs[] = { + ATTR_LIST(max_wro_seq_files), + ATTR_LIST(nr_wro_seq_files), + ATTR_LIST(max_active_seq_files), + ATTR_LIST(nr_active_seq_files), + NULL, +}; +ATTRIBUTE_GROUPS(zonefs_sysfs); + +static void zonefs_sysfs_sb_release(struct kobject *kobj) +{ + struct zonefs_sb_info *sbi = + container_of(kobj, struct zonefs_sb_info, s_kobj); + + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops zonefs_sysfs_attr_ops = { + .show = zonefs_sysfs_attr_show, +}; + +static struct kobj_type zonefs_sb_ktype = { + .default_groups = zonefs_sysfs_groups, + .sysfs_ops = &zonefs_sysfs_attr_ops, + .release = zonefs_sysfs_sb_release, +}; + +static struct kobject *zonefs_sysfs_root; + +int zonefs_sysfs_register(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret; + + init_completion(&sbi->s_kobj_unregister); + ret = kobject_init_and_add(&sbi->s_kobj, &zonefs_sb_ktype, + zonefs_sysfs_root, "%s", sb->s_id); + if (ret) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return ret; + } + + sbi->s_sysfs_registered = true; + + return 0; +} + +void zonefs_sysfs_unregister(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + + if (!sbi || !sbi->s_sysfs_registered) + return; + + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); +} + +int __init zonefs_sysfs_init(void) +{ + zonefs_sysfs_root = kobject_create_and_add("zonefs", fs_kobj); + if (!zonefs_sysfs_root) + return -ENOMEM; + + return 0; +} + +void zonefs_sysfs_exit(void) +{ + kobject_put(zonefs_sysfs_root); + zonefs_sysfs_root = NULL; +} diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 7b147907c328..4b3de66c3233 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -12,6 +12,7 @@ #include <linux/uuid.h> #include <linux/mutex.h> #include <linux/rwsem.h> +#include <linux/kobject.h> /* * Maximum length of file names: this only needs to be large enough to fit @@ -39,6 +40,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) } #define ZONEFS_ZONE_OPEN (1 << 0) +#define ZONEFS_ZONE_ACTIVE (1 << 1) /* * In-memory inode data. @@ -182,8 +184,15 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; - unsigned int s_max_open_zones; - atomic_t s_open_zones; + unsigned int s_max_wro_seq_files; + atomic_t s_wro_seq_files; + + unsigned int s_max_active_seq_files; + atomic_t s_active_seq_files; + + bool s_sysfs_registered; + struct kobject s_kobj; + struct completion s_kobj_unregister; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) @@ -198,4 +207,9 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) \ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +int zonefs_sysfs_register(struct super_block *sb); +void zonefs_sysfs_unregister(struct super_block *sb); +int zonefs_sysfs_init(void); +void zonefs_sysfs_exit(void); + #endif |