summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/adfs/super.c1
-rw-r--r--fs/affs/amigaffs.h6
-rw-r--r--fs/afs/dir.c6
-rw-r--r--fs/afs/dir_edit.c4
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/aio.c23
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs/init.c1
-rw-r--r--fs/autofs/inode.c16
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/acl.c4
-rw-r--r--fs/bcachefs/alloc_background.c500
-rw-r--r--fs/bcachefs/alloc_background.h47
-rw-r--r--fs/bcachefs/alloc_background_format.h2
-rw-r--r--fs/bcachefs/alloc_foreground.c26
-rw-r--r--fs/bcachefs/alloc_foreground.h1
-rw-r--r--fs/bcachefs/backpointers.c90
-rw-r--r--fs/bcachefs/bcachefs.h45
-rw-r--r--fs/bcachefs/bcachefs_format.h70
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h36
-rw-r--r--fs/bcachefs/bkey.c5
-rw-r--r--fs/bcachefs/bkey.h7
-rw-r--r--fs/bcachefs/bkey_methods.c1
-rw-r--r--fs/bcachefs/btree_cache.c16
-rw-r--r--fs/bcachefs/btree_cache.h2
-rw-r--r--fs/bcachefs/btree_gc.c321
-rw-r--r--fs/bcachefs/btree_gc.h23
-rw-r--r--fs/bcachefs/btree_gc_types.h13
-rw-r--r--fs/bcachefs/btree_io.c45
-rw-r--r--fs/bcachefs/btree_io.h6
-rw-r--r--fs/bcachefs/btree_iter.c114
-rw-r--r--fs/bcachefs/btree_iter.h15
-rw-r--r--fs/bcachefs/btree_journal_iter.c23
-rw-r--r--fs/bcachefs/btree_journal_iter.h17
-rw-r--r--fs/bcachefs/btree_key_cache.c344
-rw-r--r--fs/bcachefs/btree_locking.c22
-rw-r--r--fs/bcachefs/btree_locking.h31
-rw-r--r--fs/bcachefs/btree_node_scan.c51
-rw-r--r--fs/bcachefs/btree_node_scan_types.h1
-rw-r--r--fs/bcachefs/btree_trans_commit.c171
-rw-r--r--fs/bcachefs/btree_types.h22
-rw-r--r--fs/bcachefs/btree_update.c6
-rw-r--r--fs/bcachefs/btree_update.h36
-rw-r--r--fs/bcachefs/btree_update_interior.c42
-rw-r--r--fs/bcachefs/btree_update_interior.h2
-rw-r--r--fs/bcachefs/btree_write_buffer.c175
-rw-r--r--fs/bcachefs/btree_write_buffer.h52
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h2
-rw-r--r--fs/bcachefs/buckets.c766
-rw-r--r--fs/bcachefs/buckets.h79
-rw-r--r--fs/bcachefs/buckets_types.h17
-rw-r--r--fs/bcachefs/chardev.c112
-rw-r--r--fs/bcachefs/checksum.c5
-rw-r--r--fs/bcachefs/clock.c69
-rw-r--r--fs/bcachefs/clock.h9
-rw-r--r--fs/bcachefs/clock_types.h3
-rw-r--r--fs/bcachefs/darray.c3
-rw-r--r--fs/bcachefs/data_update.c44
-rw-r--r--fs/bcachefs/data_update.h5
-rw-r--r--fs/bcachefs/debug.c113
-rw-r--r--fs/bcachefs/dirent.c8
-rw-r--r--fs/bcachefs/disk_accounting.c790
-rw-r--r--fs/bcachefs/disk_accounting.h219
-rw-r--r--fs/bcachefs/disk_accounting_format.h162
-rw-r--r--fs/bcachefs/disk_accounting_types.h19
-rw-r--r--fs/bcachefs/disk_groups.c2
-rw-r--r--fs/bcachefs/ec.c117
-rw-r--r--fs/bcachefs/errcode.h3
-rw-r--r--fs/bcachefs/error.c56
-rw-r--r--fs/bcachefs/error.h22
-rw-r--r--fs/bcachefs/extents.c29
-rw-r--r--fs/bcachefs/extents.h4
-rw-r--r--fs/bcachefs/eytzinger.h17
-rw-r--r--fs/bcachefs/fs-common.h2
-rw-r--r--fs/bcachefs/fs-io-buffered.c41
-rw-r--r--fs/bcachefs/fs-io-direct.c4
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h7
-rw-r--r--fs/bcachefs/fs-io.c23
-rw-r--r--fs/bcachefs/fs-ioctl.c80
-rw-r--r--fs/bcachefs/fs.c218
-rw-r--r--fs/bcachefs/fsck.c280
-rw-r--r--fs/bcachefs/inode.c60
-rw-r--r--fs/bcachefs/inode.h2
-rw-r--r--fs/bcachefs/io_misc.c8
-rw-r--r--fs/bcachefs/io_read.c118
-rw-r--r--fs/bcachefs/io_write.c36
-rw-r--r--fs/bcachefs/io_write.h2
-rw-r--r--fs/bcachefs/journal.c40
-rw-r--r--fs/bcachefs/journal.h10
-rw-r--r--fs/bcachefs/journal_io.c46
-rw-r--r--fs/bcachefs/journal_reclaim.c11
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c2
-rw-r--r--fs/bcachefs/lru.c45
-rw-r--r--fs/bcachefs/lru.h15
-rw-r--r--fs/bcachefs/lru_format.h25
-rw-r--r--fs/bcachefs/move.c27
-rw-r--r--fs/bcachefs/movinggc.c11
-rw-r--r--fs/bcachefs/opts.c120
-rw-r--r--fs/bcachefs/opts.h15
-rw-r--r--fs/bcachefs/printbuf.c14
-rw-r--r--fs/bcachefs/printbuf.h1
-rw-r--r--fs/bcachefs/recovery.c134
-rw-r--r--fs/bcachefs/recovery_passes.c5
-rw-r--r--fs/bcachefs/recovery_passes_types.h1
-rw-r--r--fs/bcachefs/reflink.c2
-rw-r--r--fs/bcachefs/replicas.c251
-rw-r--r--fs/bcachefs/replicas.h16
-rw-r--r--fs/bcachefs/replicas_types.h16
-rw-r--r--fs/bcachefs/sb-clean.c62
-rw-r--r--fs/bcachefs/sb-downgrade.c113
-rw-r--r--fs/bcachefs/sb-downgrade.h1
-rw-r--r--fs/bcachefs/sb-errors.c14
-rw-r--r--fs/bcachefs/sb-errors_format.h7
-rw-r--r--fs/bcachefs/seqmutex.h11
-rw-r--r--fs/bcachefs/snapshot.c27
-rw-r--r--fs/bcachefs/subvolume.c20
-rw-r--r--fs/bcachefs/super-io.c5
-rw-r--r--fs/bcachefs/super.c109
-rw-r--r--fs/bcachefs/sysfs.c126
-rw-r--r--fs/bcachefs/tests.c14
-rw-r--r--fs/bcachefs/thread_with_file.c87
-rw-r--r--fs/bcachefs/thread_with_file.h4
-rw-r--r--fs/bcachefs/thread_with_file_types.h5
-rw-r--r--fs/bcachefs/trace.h50
-rw-r--r--fs/bcachefs/two_state_shared_lock.h11
-rw-r--r--fs/bcachefs/util.c25
-rw-r--r--fs/bcachefs/util.h22
-rw-r--r--fs/bcachefs/varint.c2
-rw-r--r--fs/befs/linuxvfs.c10
-rw-r--r--fs/binfmt_elf.c101
-rw-r--r--fs/binfmt_elf_fdpic.c5
-rw-r--r--fs/binfmt_misc.c8
-rw-r--r--fs/binfmt_script.c1
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/accessors.h15
-rw-r--r--fs/btrfs/bio.c4
-rw-r--r--fs/btrfs/block-group.c66
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h156
-rw-r--r--fs/btrfs/compression.c25
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c108
-rw-r--r--fs/btrfs/ctree.h18
-rw-r--r--fs/btrfs/defrag.c18
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delalloc-space.h2
-rw-r--r--fs/btrfs/delayed-inode.c47
-rw-r--r--fs/btrfs/delayed-inode.h10
-rw-r--r--fs/btrfs/delayed-ref.c51
-rw-r--r--fs/btrfs/delayed-ref.h8
-rw-r--r--fs/btrfs/dev-replace.c4
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/dir-item.h6
-rw-r--r--fs/btrfs/direct-io.c1052
-rw-r--r--fs/btrfs/direct-io.h14
-rw-r--r--fs/btrfs/disk-io.c130
-rw-r--r--fs/btrfs/disk-io.h18
-rw-r--r--fs/btrfs/export.c6
-rw-r--r--fs/btrfs/extent-io-tree.c4
-rw-r--r--fs/btrfs/extent-tree.c685
-rw-r--r--fs/btrfs/extent-tree.h8
-rw-r--r--fs/btrfs/extent_io.c1094
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/extent_map.c366
-rw-r--r--fs/btrfs/extent_map.h54
-rw-r--r--fs/btrfs/fiemap.c930
-rw-r--r--fs/btrfs/fiemap.h11
-rw-r--r--fs/btrfs/file-item.c52
-rw-r--r--fs/btrfs/file.c355
-rw-r--r--fs/btrfs/file.h4
-rw-r--r--fs/btrfs/free-space-cache.c14
-rw-r--r--fs/btrfs/free-space-tree.c10
-rw-r--r--fs/btrfs/fs.h82
-rw-r--r--fs/btrfs/inode-item.c4
-rw-r--r--fs/btrfs/inode.c1461
-rw-r--r--fs/btrfs/ioctl.c96
-rw-r--r--fs/btrfs/ioctl.h2
-rw-r--r--fs/btrfs/locking.h1
-rw-r--r--fs/btrfs/lru_cache.h1
-rw-r--r--fs/btrfs/lzo.c43
-rw-r--r--fs/btrfs/messages.c3
-rw-r--r--fs/btrfs/misc.h4
-rw-r--r--fs/btrfs/ordered-data.c146
-rw-r--r--fs/btrfs/ordered-data.h27
-rw-r--r--fs/btrfs/print-tree.c10
-rw-r--r--fs/btrfs/props.c20
-rw-r--r--fs/btrfs/props.h4
-rw-r--r--fs/btrfs/qgroup.c235
-rw-r--r--fs/btrfs/qgroup.h25
-rw-r--r--fs/btrfs/raid-stripe-tree.c13
-rw-r--r--fs/btrfs/raid-stripe-tree.h3
-rw-r--r--fs/btrfs/raid56.c118
-rw-r--r--fs/btrfs/ref-verify.c9
-rw-r--r--fs/btrfs/reflink.c8
-rw-r--r--fs/btrfs/relocation.c160
-rw-r--r--fs/btrfs/scrub.c37
-rw-r--r--fs/btrfs/send.c51
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/space-info.c269
-rw-r--r--fs/btrfs/space-info.h48
-rw-r--r--fs/btrfs/subpage.c162
-rw-r--r--fs/btrfs/subpage.h9
-rw-r--r--fs/btrfs/super.c60
-rw-r--r--fs/btrfs/super.h3
-rw-r--r--fs/btrfs/sysfs.c85
-rw-r--r--fs/btrfs/tests/btrfs-tests.c5
-rw-r--r--fs/btrfs/tests/extent-map-tests.c120
-rw-r--r--fs/btrfs/tests/inode-tests.c176
-rw-r--r--fs/btrfs/transaction.c31
-rw-r--r--fs/btrfs/transaction.h9
-rw-r--r--fs/btrfs/tree-checker.c37
-rw-r--r--fs/btrfs/tree-log.c115
-rw-r--r--fs/btrfs/tree-log.h6
-rw-r--r--fs/btrfs/ulist.c21
-rw-r--r--fs/btrfs/ulist.h2
-rw-r--r--fs/btrfs/uuid-tree.c10
-rw-r--r--fs/btrfs/uuid-tree.h4
-rw-r--r--fs/btrfs/volumes.c62
-rw-r--r--fs/btrfs/volumes.h2
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c56
-rw-r--r--fs/btrfs/zoned.c31
-rw-r--r--fs/btrfs/zoned.h14
-rw-r--r--fs/btrfs/zstd.c70
-rw-r--r--fs/buffer.c9
-rw-r--r--fs/cachefiles/cache.c45
-rw-r--r--fs/cachefiles/daemon.c4
-rw-r--r--fs/cachefiles/internal.h3
-rw-r--r--fs/cachefiles/ondemand.c52
-rw-r--r--fs/cachefiles/volume.c1
-rw-r--r--fs/cachefiles/xattr.c5
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/coda/symlink.c10
-rw-r--r--fs/configfs/dir.c10
-rw-r--r--fs/cramfs/inode.c26
-rw-r--r--fs/dcache.c90
-rw-r--r--fs/debugfs/inode.c16
-rw-r--r--fs/dlm/ast.c172
-rw-r--r--fs/dlm/ast.h11
-rw-r--r--fs/dlm/config.c2
-rw-r--r--fs/dlm/debug_fs.c10
-rw-r--r--fs/dlm/dlm_internal.h60
-rw-r--r--fs/dlm/lock.c568
-rw-r--r--fs/dlm/lock.h7
-rw-r--r--fs/dlm/lockspace.c131
-rw-r--r--fs/dlm/lowcomms.c8
-rw-r--r--fs/dlm/lowcomms.h2
-rw-r--r--fs/dlm/member.c2
-rw-r--r--fs/dlm/memory.c10
-rw-r--r--fs/dlm/midcomms.c4
-rw-r--r--fs/dlm/midcomms.h2
-rw-r--r--fs/dlm/recover.c78
-rw-r--r--fs/dlm/recover.h2
-rw-r--r--fs/dlm/recoverd.c14
-rw-r--r--fs/dlm/user.c42
-rw-r--r--fs/efivarfs/super.c12
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/efs/symlink.c14
-rw-r--r--fs/erofs/compress.h61
-rw-r--r--fs/erofs/decompressor.c148
-rw-r--r--fs/erofs/decompressor_deflate.c149
-rw-r--r--fs/erofs/decompressor_lzma.c166
-rw-r--r--fs/erofs/decompressor_zstd.c154
-rw-r--r--fs/erofs/internal.h48
-rw-r--r--fs/erofs/super.c36
-rw-r--r--fs/erofs/zdata.c346
-rw-r--r--fs/erofs/zmap.c6
-rw-r--r--fs/erofs/zutil.c8
-rw-r--r--fs/exec.c63
-rw-r--r--fs/exec_test.c141
-rw-r--r--fs/exfat/dir.c2
-rw-r--r--fs/exfat/file.c22
-rw-r--r--fs/exfat/super.c10
-rw-r--r--fs/exportfs/expfs.c9
-rw-r--r--fs/ext2/balloc.c11
-rw-r--r--fs/ext4/block_validity.c2
-rw-r--r--fs/ext4/crypto.c10
-rw-r--r--fs/ext4/ext4.h37
-rw-r--r--fs/ext4/extents_status.c72
-rw-r--r--fs/ext4/extents_status.h5
-rw-r--r--fs/ext4/fast_commit.c14
-rw-r--r--fs/ext4/ialloc.c5
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/ext4/inode-test.c1
-rw-r--r--fs/ext4/inode.c255
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/namei.c195
-rw-r--r--fs/ext4/super.c32
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/acl.c3
-rw-r--r--fs/f2fs/dir.c105
-rw-r--r--fs/f2fs/f2fs.h16
-rw-r--r--fs/f2fs/file.c6
-rw-r--r--fs/f2fs/namei.c10
-rw-r--r--fs/f2fs/recovery.c9
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/fat/fat.h18
-rw-r--r--fs/fat/fat_test.c1
-rw-r--r--fs/fat/inode.c675
-rw-r--r--fs/fat/namei_msdos.c38
-rw-r--r--fs/fat/namei_vfat.c38
-rw-r--r--fs/fhandle.c178
-rw-r--r--fs/fs_parser.c34
-rw-r--r--fs/fsopen.c7
-rw-r--r--fs/fuse/acl.c4
-rw-r--r--fs/fuse/inode.c24
-rw-r--r--fs/fuse/virtio_fs.c62
-rw-r--r--fs/gfs2/glock.c227
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/glops.c42
-rw-r--r--fs/gfs2/incore.h12
-rw-r--r--fs/gfs2/lock_dlm.c28
-rw-r--r--fs/gfs2/ops_fstype.c13
-rw-r--r--fs/gfs2/quota.c388
-rw-r--r--fs/gfs2/super.c1
-rw-r--r--fs/gfs2/trace_gfs2.h6
-rw-r--r--fs/gfs2/util.c12
-rw-r--r--fs/hfs/inode.c3
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/bfind.c15
-rw-r--r--fs/hfsplus/extents.c9
-rw-r--r--fs/hfsplus/hfsplus_fs.h21
-rw-r--r--fs/hfsplus/ioctl.c4
-rw-r--r--fs/hfsplus/xattr.c2
-rw-r--r--fs/hostfs/hostfs_kern.c106
-rw-r--r--fs/hpfs/namei.c15
-rw-r--r--fs/hpfs/super.c1
-rw-r--r--fs/hugetlbfs/inode.c32
-rw-r--r--fs/inode.c109
-rw-r--r--fs/internal.h16
-rw-r--r--fs/iomap/buffered-io.c81
-rw-r--r--fs/isofs/inode.c17
-rw-r--r--fs/isofs/rock.c17
-rw-r--r--fs/jbd2/commit.c14
-rw-r--r--fs/jbd2/journal.c124
-rw-r--r--fs/jbd2/recovery.c33
-rw-r--r--fs/jbd2/transaction.c45
-rw-r--r--fs/jffs2/file.c14
-rw-r--r--fs/libfs.c74
-rw-r--r--fs/lockd/Makefile9
-rw-r--r--fs/locks.c11
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/namei.c3
-rw-r--r--fs/mount.h3
-rw-r--r--fs/mpage.c13
-rw-r--r--fs/namei.c249
-rw-r--r--fs/namespace.c524
-rw-r--r--fs/netfs/buffered_read.c18
-rw-r--r--fs/netfs/buffered_write.c26
-rw-r--r--fs/netfs/direct_read.c2
-rw-r--r--fs/netfs/direct_write.c11
-rw-r--r--fs/netfs/fscache_cache.c4
-rw-r--r--fs/netfs/fscache_cookie.c28
-rw-r--r--fs/netfs/fscache_io.c12
-rw-r--r--fs/netfs/fscache_main.c2
-rw-r--r--fs/netfs/fscache_volume.c18
-rw-r--r--fs/netfs/internal.h44
-rw-r--r--fs/netfs/io.c12
-rw-r--r--fs/netfs/main.c4
-rw-r--r--fs/netfs/misc.c85
-rw-r--r--fs/netfs/write_collect.c16
-rw-r--r--fs/netfs/write_issue.c38
-rw-r--r--fs/nfs/blocklayout/blocklayout.c25
-rw-r--r--fs/nfs/blocklayout/blocklayout.h9
-rw-r--r--fs/nfs/blocklayout/dev.c116
-rw-r--r--fs/nfs/callback.h5
-rw-r--r--fs/nfs/callback_proc.c19
-rw-r--r--fs/nfs/callback_xdr.c39
-rw-r--r--fs/nfs/client.c3
-rw-r--r--fs/nfs/delegation.c67
-rw-r--r--fs/nfs/delegation.h45
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c21
-rw-r--r--fs/nfs/filelayout/filelayout.c1
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/fscache.c2
-rw-r--r--fs/nfs/inode.c88
-rw-r--r--fs/nfs/internal.h25
-rw-r--r--fs/nfs/iostat.h4
-rw-r--r--fs/nfs/mount_clnt.c5
-rw-r--r--fs/nfs/nfs2super.c1
-rw-r--r--fs/nfs/nfs3proc.c10
-rw-r--r--fs/nfs/nfs3super.c1
-rw-r--r--fs/nfs/nfs4_fs.h3
-rw-r--r--fs/nfs/nfs4client.c6
-rw-r--r--fs/nfs/nfs4proc.c303
-rw-r--r--fs/nfs/nfs4state.c4
-rw-r--r--fs/nfs/nfs4super.c1
-rw-r--r--fs/nfs/nfs4trace.c7
-rw-r--r--fs/nfs/nfs4trace.h88
-rw-r--r--fs/nfs/nfs4xdr.c138
-rw-r--r--fs/nfs/nfstrace.h36
-rw-r--r--fs/nfs/pagelist.c117
-rw-r--r--fs/nfs/pnfs.c223
-rw-r--r--fs/nfs/pnfs.h53
-rw-r--r--fs/nfs/pnfs_dev.c3
-rw-r--r--fs/nfs/pnfs_nfs.c47
-rw-r--r--fs/nfs/proc.c10
-rw-r--r--fs/nfs/read.c80
-rw-r--r--fs/nfs/symlink.c10
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c369
-rw-r--r--fs/nfs_common/grace.c1
-rw-r--r--fs/nfs_common/nfsacl.c1
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/filecache.c2
-rw-r--r--fs/nfsd/netlink.c17
-rw-r--r--fs/nfsd/netlink.h2
-rw-r--r--fs/nfsd/nfs2acl.c2
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs4proc.c5
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4xdr.c12
-rw-r--r--fs/nfsd/nfsctl.c101
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfssvc.c67
-rw-r--r--fs/nilfs2/alloc.c19
-rw-r--r--fs/nilfs2/alloc.h4
-rw-r--r--fs/nilfs2/bmap.c10
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c38
-rw-r--r--fs/nilfs2/ifile.c7
-rw-r--r--fs/nilfs2/nilfs.h10
-rw-r--r--fs/nilfs2/the_nilfs.c6
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/nls/mac-celtic.c1
-rw-r--r--fs/nls/mac-centeuro.c1
-rw-r--r--fs/nls/mac-croatian.c1
-rw-r--r--fs/nls/mac-cyrillic.c1
-rw-r--r--fs/nls/mac-gaelic.c1
-rw-r--r--fs/nls/mac-greek.c1
-rw-r--r--fs/nls/mac-iceland.c1
-rw-r--r--fs/nls/mac-inuit.c1
-rw-r--r--fs/nls/mac-roman.c1
-rw-r--r--fs/nls/mac-romanian.c1
-rw-r--r--fs/nls/mac-turkish.c1
-rw-r--r--fs/nls/nls_ascii.c1
-rw-r--r--fs/nls/nls_base.c1
-rw-r--r--fs/nls/nls_cp1250.c1
-rw-r--r--fs/nls/nls_cp1251.c1
-rw-r--r--fs/nls/nls_cp1255.c1
-rw-r--r--fs/nls/nls_cp437.c1
-rw-r--r--fs/nls/nls_cp737.c1
-rw-r--r--fs/nls/nls_cp775.c1
-rw-r--r--fs/nls/nls_cp850.c1
-rw-r--r--fs/nls/nls_cp852.c1
-rw-r--r--fs/nls/nls_cp855.c1
-rw-r--r--fs/nls/nls_cp857.c1
-rw-r--r--fs/nls/nls_cp860.c1
-rw-r--r--fs/nls/nls_cp861.c1
-rw-r--r--fs/nls/nls_cp862.c1
-rw-r--r--fs/nls/nls_cp863.c1
-rw-r--r--fs/nls/nls_cp864.c1
-rw-r--r--fs/nls/nls_cp865.c1
-rw-r--r--fs/nls/nls_cp866.c1
-rw-r--r--fs/nls/nls_cp869.c1
-rw-r--r--fs/nls/nls_cp874.c1
-rw-r--r--fs/nls/nls_cp932.c1
-rw-r--r--fs/nls/nls_cp936.c1
-rw-r--r--fs/nls/nls_cp949.c1
-rw-r--r--fs/nls/nls_cp950.c1
-rw-r--r--fs/nls/nls_euc-jp.c1
-rw-r--r--fs/nls/nls_iso8859-1.c1
-rw-r--r--fs/nls/nls_iso8859-13.c1
-rw-r--r--fs/nls/nls_iso8859-14.c1
-rw-r--r--fs/nls/nls_iso8859-15.c1
-rw-r--r--fs/nls/nls_iso8859-2.c1
-rw-r--r--fs/nls/nls_iso8859-3.c1
-rw-r--r--fs/nls/nls_iso8859-4.c1
-rw-r--r--fs/nls/nls_iso8859-5.c1
-rw-r--r--fs/nls/nls_iso8859-6.c1
-rw-r--r--fs/nls/nls_iso8859-7.c1
-rw-r--r--fs/nls/nls_iso8859-9.c1
-rw-r--r--fs/nls/nls_koi8-r.c1
-rw-r--r--fs/nls/nls_koi8-ru.c1
-rw-r--r--fs/nls/nls_koi8-u.c1
-rw-r--r--fs/nls/nls_ucs2_utils.c1
-rw-r--r--fs/nls/nls_utf8.c1
-rw-r--r--fs/notify/fsnotify.c31
-rw-r--r--fs/notify/fsnotify.h2
-rw-r--r--fs/notify/mark.c32
-rw-r--r--fs/nsfs.c122
-rw-r--r--fs/ntfs3/super.c12
-rw-r--r--fs/ocfs2/aops.c5
-rw-r--r--fs/ocfs2/journal.c17
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/open.c43
-rw-r--r--fs/openpromfs/inode.c1
-rw-r--r--fs/orangefs/inode.c13
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/pidfs.c90
-rw-r--r--fs/proc/internal.h33
-rw-r--r--fs/proc/page.c42
-rw-r--r--fs/proc/proc_sysctl.c70
-rw-r--r--fs/proc/task_mmu.c506
-rw-r--r--fs/proc_namespace.c6
-rw-r--r--fs/pstore/blk.c2
-rw-r--r--fs/pstore/platform.c1
-rw-r--r--fs/pstore/ram.c14
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/qnx6/inode.c1
-rw-r--r--fs/quota/dquot.c8
-rw-r--r--fs/read_write.c18
-rw-r--r--fs/readdir.c4
-rw-r--r--fs/reiserfs/inode.c1
-rw-r--r--fs/romfs/super.c22
-rw-r--r--fs/smb/client/cifsfs.c2
-rw-r--r--fs/smb/client/cifsglob.h21
-rw-r--r--fs/smb/client/file.c83
-rw-r--r--fs/smb/client/fs_context.c39
-rw-r--r--fs/smb/client/smb1ops.c2
-rw-r--r--fs/smb/client/smb2ops.c42
-rw-r--r--fs/smb/client/smb2pdu.c43
-rw-r--r--fs/smb/client/trace.h55
-rw-r--r--fs/smb/client/transport.c8
-rw-r--r--fs/smb/common/smb2pdu.h34
-rw-r--r--fs/smb/server/smb2pdu.c22
-rw-r--r--fs/stat.c206
-rw-r--r--fs/super.c11
-rw-r--r--fs/sysv/super.c1
-rw-r--r--fs/tracefs/inode.c16
-rw-r--r--fs/udf/balloc.c74
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c13
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/super.c18
-rw-r--r--fs/ufs/dir.c1
-rw-r--r--fs/userfaultfd.c9
-rw-r--r--fs/vboxsf/file.c18
-rw-r--r--fs/vboxsf/super.c16
-rw-r--r--fs/verity/measure.c5
-rw-r--r--fs/xfs/Kconfig12
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/libxfs/xfs_ag.c2
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.h19
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c235
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h18
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c64
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c86
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c51
-rw-r--r--fs/xfs/libxfs/xfs_btree.h16
-rw-r--r--fs/xfs/libxfs/xfs_defer.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c661
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h49
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c31
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h7
-rw-r--r--fs/xfs/libxfs/xfs_format.h9
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c20
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c23
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c749
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.h62
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h1
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c156
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h11
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c266
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h15
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c7
-rw-r--r--fs/xfs/libxfs/xfs_shared.h7
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c1
-rw-r--r--fs/xfs/scrub/common.c1
-rw-r--r--fs/xfs/scrub/newbt.c5
-rw-r--r--fs/xfs/scrub/quota_repair.c1
-rw-r--r--fs/xfs/scrub/reap.c7
-rw-r--r--fs/xfs/scrub/tempfile.c21
-rw-r--r--fs/xfs/xfs.h4
-rw-r--r--fs/xfs/xfs_bmap_item.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c52
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf_item.c32
-rw-r--r--fs/xfs/xfs_discard.c303
-rw-r--r--fs/xfs/xfs_dquot_item.c31
-rw-r--r--fs/xfs/xfs_drain.c8
-rw-r--r--fs/xfs/xfs_drain.h5
-rw-r--r--fs/xfs/xfs_extfree_item.c119
-rw-r--r--fs/xfs/xfs_extfree_item.h6
-rw-r--r--fs/xfs/xfs_file.c141
-rw-r--r--fs/xfs/xfs_handle.c1
-rw-r--r--fs/xfs/xfs_icache.c7
-rw-r--r--fs/xfs/xfs_inode.c1494
-rw-r--r--fs/xfs/xfs_inode.h70
-rw-r--r--fs/xfs/xfs_inode_item.c38
-rw-r--r--fs/xfs/xfs_ioctl.c60
-rw-r--r--fs/xfs/xfs_iomap.c105
-rw-r--r--fs/xfs/xfs_iops.c66
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c511
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c177
-rw-r--r--fs/xfs/xfs_log_priv.h61
-rw-r--r--fs/xfs/xfs_log_recover.c28
-rw-r--r--fs/xfs/xfs_qm.c7
-rw-r--r--fs/xfs/xfs_qm_bhv.c1
-rw-r--r--fs/xfs/xfs_refcount_item.c110
-rw-r--r--fs/xfs/xfs_refcount_item.h5
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_reflink.h10
-rw-r--r--fs/xfs/xfs_rmap_item.c151
-rw-r--r--fs/xfs/xfs_rmap_item.h4
-rw-r--r--fs/xfs/xfs_rtalloc.c3
-rw-r--r--fs/xfs/xfs_symlink.c70
-rw-r--r--fs/xfs/xfs_sysfs.c29
-rw-r--r--fs/xfs/xfs_trace.c4
-rw-r--r--fs/xfs/xfs_trace.h533
-rw-r--r--fs/xfs/xfs_trans.c129
-rw-r--r--fs/xfs/xfs_trans.h5
-rw-r--r--fs/xfs/xfs_trans_ail.c244
-rw-r--r--fs/xfs/xfs_trans_priv.h44
-rw-r--r--fs/zonefs/super.c1
621 files changed, 20810 insertions, 15077 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index f5693164ca9a..bd2f530e5740 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -176,4 +176,12 @@ config COREDUMP
certainly want to say Y here. Not necessary on systems that never
need debugging or only ever run flawless code.
+config EXEC_KUNIT_TEST
+ bool "Build execve tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the exec KUnit tests, which tests boundary conditions
+ of various aspects of the exec internals.
+
endmenu
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9354b14bbfe3..f0b999a4961b 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -491,4 +491,5 @@ static void __exit exit_adfs_fs(void)
module_init(init_adfs_fs)
module_exit(exit_adfs_fs)
+MODULE_DESCRIPTION("Acorn Disc Filing System");
MODULE_LICENSE("GPL");
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
index 81fb396d4dfa..1b973a669d23 100644
--- a/fs/affs/amigaffs.h
+++ b/fs/affs/amigaffs.h
@@ -80,7 +80,7 @@ struct affs_head {
__be32 spare1;
__be32 first_data;
__be32 checksum;
- __be32 table[1];
+ __be32 table[];
};
struct affs_tail {
@@ -108,7 +108,7 @@ struct slink_front
__be32 key;
__be32 spare1[3];
__be32 checksum;
- u8 symname[1]; /* depends on block size */
+ u8 symname[]; /* depends on block size */
};
struct affs_data_head
@@ -119,7 +119,7 @@ struct affs_data_head
__be32 size;
__be32 next;
__be32 checksum;
- u8 data[1]; /* depends on block size */
+ u8 data[]; /* depends on block size */
};
/* Permission bits */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 67afe68972d5..f8622ed72e08 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -533,14 +533,14 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
break;
}
- offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio);
+ offset = round_down(ctx->pos, sizeof(*dblock)) - folio_pos(folio);
size = min_t(loff_t, folio_size(folio),
- req->actual_len - folio_file_pos(folio));
+ req->actual_len - folio_pos(folio));
do {
dblock = kmap_local_folio(folio, offset);
ret = afs_dir_iterate_block(dvnode, ctx, dblock,
- folio_file_pos(folio) + offset);
+ folio_pos(folio) + offset);
kunmap_local(dblock);
if (ret != 1)
goto out;
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index e2fa577b66fe..a71bff10496b 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -256,7 +256,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
folio = folio0;
}
- block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
@@ -417,7 +417,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
folio = folio0;
}
- block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio));
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 15bb7989c387..3acf5e050072 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -512,7 +512,7 @@ static int afs_iget5_set_root(struct inode *inode, void *opaque)
struct afs_vnode *vnode = AFS_FS_I(inode);
vnode->volume = as->volume;
- vnode->fid.vid = as->volume->vid,
+ vnode->fid.vid = as->volume->vid;
vnode->fid.vnode = 1;
vnode->fid.unique = 1;
inode->i_ino = 1;
@@ -545,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
diff --git a/fs/aio.c b/fs/aio.c
index 57c9f7c077e6..6066f64967b3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -410,17 +410,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
struct kioctx *ctx;
unsigned long flags;
pgoff_t idx;
- int rc;
-
- /*
- * We cannot support the _NO_COPY case here, because copy needs to
- * happen under the ctx->completion_lock. That does not work with the
- * migration workflow of MIGRATE_SYNC_NO_COPY.
- */
- if (mode == MIGRATE_SYNC_NO_COPY)
- return -EINVAL;
-
- rc = 0;
+ int rc = 0;
/* mapping->i_private_lock here protects against the kioctx teardown. */
spin_lock(&mapping->i_private_lock);
@@ -465,7 +455,8 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
* events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
- folio_migrate_copy(dst, src);
+ folio_copy(dst, src);
+ folio_migrate_flags(dst, src);
BUG_ON(ctx->ring_folios[idx] != src);
ctx->ring_folios[idx] = dst;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -1516,7 +1507,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
iocb_put(iocb);
}
-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
@@ -1542,7 +1533,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
} else
req->ki_ioprio = get_current_ioprio();
- ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type);
if (unlikely(ret))
return ret;
@@ -1594,7 +1585,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;
- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, READ);
if (ret)
return ret;
file = req->ki_filp;
@@ -1621,7 +1612,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;
- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, WRITE);
if (ret)
return ret;
file = req->ki_filp;
diff --git a/fs/attr.c b/fs/attr.c
index 960a310581eb..825007d5cda4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -17,8 +17,6 @@
#include <linux/filelock.h>
#include <linux/security.h>
-#include "internal.h"
-
/**
* setattr_should_drop_sgid - determine whether the setgid bit needs to be
* removed
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b5e4dfa04ed0..1d644a35ffa0 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -38,4 +38,5 @@ static void __exit exit_autofs_fs(void)
module_init(init_autofs_fs)
module_exit(exit_autofs_fs)
+MODULE_DESCRIPTION("Kernel automounter support");
MODULE_LICENSE("GPL");
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 1f5db6863663..cf792d4de4f1 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -126,7 +126,7 @@ enum {
const struct fs_parameter_spec autofs_param_specs[] = {
fsparam_flag ("direct", Opt_direct),
fsparam_fd ("fd", Opt_fd),
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_flag ("ignore", Opt_ignore),
fsparam_flag ("indirect", Opt_indirect),
fsparam_u32 ("maxproto", Opt_maxproto),
@@ -134,7 +134,7 @@ const struct fs_parameter_spec autofs_param_specs[] = {
fsparam_flag ("offset", Opt_offset),
fsparam_u32 ("pgrp", Opt_pgrp),
fsparam_flag ("strictexpire", Opt_strictexpire),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -193,8 +193,6 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct autofs_fs_context *ctx = fc->fs_private;
struct autofs_sb_info *sbi = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, autofs_param_specs, param, &result);
@@ -205,16 +203,10 @@ static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_fd:
return autofs_parse_fd(fc, sbi, param, &result);
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalfc(fc, "Invalid uid");
- ctx->uid = uid;
+ ctx->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalfc(fc, "Invalid gid");
- ctx->gid = gid;
+ ctx->gid = result.gid;
break;
case Opt_pgrp:
ctx->pgrp = result.uint_32;
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 66ca0bbee639..0ab533a2b03b 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -29,10 +29,11 @@ bcachefs-y := \
clock.o \
compress.o \
darray.o \
+ data_update.o \
debug.o \
dirent.o \
+ disk_accounting.o \
disk_groups.o \
- data_update.o \
ec.o \
errcode.o \
error.o \
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 250d6c6d3a3a..a7b425d3c8a0 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -346,7 +346,6 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
@@ -354,6 +353,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
int ret;
mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
acl = _acl;
@@ -394,8 +394,8 @@ btree_err:
set_cached_acl(&inode->v, type, acl);
err:
- mutex_unlock(&inode->ei_update_lock);
bch2_trans_put(trans);
+ mutex_unlock(&inode->ei_update_lock);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 8dec2c6cbb7e..d9c5a92fa708 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -3,6 +3,7 @@
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
@@ -14,6 +15,7 @@
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
@@ -29,7 +31,7 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
/* Persistent alloc info: */
@@ -267,27 +269,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
i == READ ? "read" : "write",
a.v->io_time[i], LRU_TIME_MAX);
+ unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) >
+ offsetof(struct bch_alloc_v4, stripe_sectors)
+ ? a.v->stripe_sectors
+ : 0;
+
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
+ bkey_fsck_err_on(stripe_sectors ||
+ a.v->dirty_sectors ||
+ a.v->cached_sectors ||
+ a.v->stripe,
c, err, alloc_key_empty_but_have_data,
- "empty data type free but have data");
+ "empty data type free but have data %u.%u.%u %u",
+ stripe_sectors,
+ a.v->dirty_sectors,
+ a.v->cached_sectors,
+ a.v->stripe);
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
+ bkey_fsck_err_on(!a.v->dirty_sectors &&
+ !stripe_sectors,
c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
- bch2_bucket_sectors_dirty(*a.v) ||
+ a.v->dirty_sectors ||
+ stripe_sectors ||
a.v->stripe,
c, err, alloc_key_cached_inconsistency,
"data type inconsistency");
@@ -318,6 +334,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
a->fragmentation_lru = swab64(a->fragmentation_lru);
+ a->stripe_sectors = swab32(a->stripe_sectors);
bps = alloc_v4_backpointers(a);
for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@@ -342,6 +359,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
+ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
prt_printf(out, "stripe %u\n", a->stripe);
prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
@@ -459,7 +477,8 @@ err:
}
__flatten
-struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
+struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
+ enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
@@ -467,7 +486,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@@ -578,8 +597,6 @@ int bch2_alloc_read(struct bch_fs *c)
struct bch_dev *ca = NULL;
int ret;
- down_read(&c->gc_lock);
-
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_prefetch, k, ({
@@ -628,7 +645,6 @@ int bch2_alloc_read(struct bch_fs *c)
bch2_dev_put(ca);
bch2_trans_put(trans);
- up_read(&c->gc_lock);
bch_err_fn(c, ret);
return ret;
@@ -743,6 +759,61 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
+static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
+ enum bch_data_type data_type,
+ s64 delta_buckets,
+ s64 delta_sectors,
+ s64 delta_fragmented, unsigned flags)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = data_type,
+ };
+ s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
+
+ return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+}
+
+int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ unsigned flags)
+{
+ s64 old_sectors = bch2_bucket_sectors(*old);
+ s64 new_sectors = bch2_bucket_sectors(*new);
+ if (old->data_type != new->data_type) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
+ bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
+ -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ } else if (old_sectors != new_sectors) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
+ 0,
+ new_sectors - old_sectors,
+ bch2_bucket_sectors_fragmented(ca, *new) -
+ bch2_bucket_sectors_fragmented(ca, *old), flags);
+ if (ret)
+ return ret;
+ }
+
+ s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
+ s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
+ if (old_unstriped != new_unstriped) {
+ int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
+ !!new_unstriped - !!old_unstriped,
+ new_unstriped - old_unstriped,
+ 0,
+ flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
@@ -758,10 +829,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
if (flags & BTREE_TRIGGER_transactional) {
- struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
-
alloc_data_type_set(new_a, new_a->data_type);
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
@@ -818,22 +888,21 @@ int bch2_trigger_alloc(struct btree_trans *trans,
goto err;
}
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
old_a->cached_sectors) {
- ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
- -((s64) old_a->cached_sectors));
+ ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
+ -((s64) old_a->cached_sectors),
+ flags & BTREE_TRIGGER_gc);
if (ret)
goto err;
}
+
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
+ if (ret)
+ goto err;
}
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
- struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
u64 journal_seq = trans->journal_res.seq;
u64 bucket_journal_seq = new_a->journal_seq;
@@ -862,26 +931,22 @@ int bch2_trigger_alloc(struct btree_trans *trans,
c->journal.flushed_seq_ondisk,
new.k->p.inode, new.k->p.offset,
bucket_journal_seq);
- if (ret) {
- bch2_fs_fatal_error(c,
- "setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
+ if (bch2_fs_fatal_err_on(ret, c,
+ "setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
goto err;
- }
}
- percpu_down_read(&c->mark_lock);
if (new_a->gen != old_a->gen) {
+ rcu_read_lock();
u8 *gen = bucket_gen(ca, new.k->p.offset);
if (unlikely(!gen)) {
- percpu_up_read(&c->mark_lock);
+ rcu_read_unlock();
goto invalid_bucket;
}
*gen = new_a->gen;
+ rcu_read_unlock();
}
- bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
- percpu_up_read(&c->mark_lock);
-
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
@@ -893,42 +958,27 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (statechange(a->data_type == BCH_DATA_need_discard) &&
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a))
- bch2_discard_one_bucket_fast(c, new.k->p);
+ bch2_discard_one_bucket_fast(ca, new.k->p.offset);
if (statechange(a->data_type == BCH_DATA_cached) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_gc_gens_async(c);
}
- if ((flags & BTREE_TRIGGER_gc) &&
- (flags & BTREE_TRIGGER_bucket_invalidate)) {
- struct bch_alloc_v4 new_a_convert;
- const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
-
- percpu_down_read(&c->mark_lock);
+ if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
+ rcu_read_lock();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
if (unlikely(!g)) {
- percpu_up_read(&c->mark_lock);
+ rcu_read_unlock();
goto invalid_bucket;
}
g->gen_valid = 1;
-
- bucket_lock(g);
-
- g->gen_valid = 1;
- g->gen = new_a->gen;
- g->data_type = new_a->data_type;
- g->stripe = new_a->stripe;
- g->stripe_redundancy = new_a->stripe_redundancy;
- g->dirty_sectors = new_a->dirty_sectors;
- g->cached_sectors = new_a->cached_sectors;
-
- bucket_unlock(g);
- percpu_up_read(&c->mark_lock);
+ g->gen = new_a->gen;
+ rcu_read_unlock();
}
err:
printbuf_exit(&buf);
@@ -1062,7 +1112,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
if (fsck_err_on(!ca,
- c, alloc_key_to_missing_dev_bucket,
+ trans, alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
ret = bch2_btree_delete_at(trans, alloc_iter, 0);
@@ -1082,7 +1132,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(k.k->type != discard_key_type,
- c, need_discard_key_wrong,
+ trans, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
@@ -1112,7 +1162,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(k.k->type != freespace_key_type,
- c, freespace_key_wrong,
+ trans, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
@@ -1143,7 +1193,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
- c, bucket_gens_key_wrong,
+ trans, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
@@ -1184,7 +1234,6 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
struct bpos *end,
struct btree_iter *freespace_iter)
{
- struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
int ret;
@@ -1202,7 +1251,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
if (fsck_err_on(k.k->type != KEY_TYPE_set,
- c, freespace_hole_missing,
+ trans, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
@@ -1238,7 +1287,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
struct bpos *end,
struct btree_iter *bucket_gens_iter)
{
- struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct printbuf buf = PRINTBUF;
unsigned i, gens_offset, gens_end_offset;
@@ -1262,7 +1310,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
bkey_reassemble(&g.k_i, k);
for (i = gens_offset; i < gens_end_offset; i++) {
- if (fsck_err_on(g.v.gens[i], c,
+ if (fsck_err_on(g.v.gens[i], trans,
bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
@@ -1320,8 +1368,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
if (ret)
return ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
- need_discard_freespace_key_to_invalid_dev_bucket,
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
+ trans, need_discard_freespace_key_to_invalid_dev_bucket,
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
goto delete;
@@ -1330,8 +1378,8 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran
if (fsck_err_on(a->data_type != state ||
(state == BCH_DATA_free &&
- genbits != alloc_freespace_genbits(*a)), c,
- need_discard_freespace_key_bad,
+ genbits != alloc_freespace_genbits(*a)),
+ trans, need_discard_freespace_key_bad,
"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
bch2_btree_id_str(iter->btree_id),
@@ -1378,7 +1426,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
if (!ca) {
- if (fsck_err(c, bucket_gens_to_invalid_dev,
+ if (fsck_err(trans, bucket_gens_to_invalid_dev,
"bucket_gens key for invalid device:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1386,8 +1434,8 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
if (fsck_err_on(end <= ca->mi.first_bucket ||
- start >= ca->mi.nbuckets, c,
- bucket_gens_to_invalid_buckets,
+ start >= ca->mi.nbuckets,
+ trans, bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1395,16 +1443,16 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
}
for (b = start; b < ca->mi.first_bucket; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
- bucket_gens_nonzero_for_invalid_buckets,
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
}
for (b = ca->mi.nbuckets; b < end; b++)
- if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
- bucket_gens_nonzero_for_invalid_buckets,
+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
+ trans, bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@@ -1553,13 +1601,13 @@ err:
}
static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
- struct btree_iter *alloc_iter)
+ struct btree_iter *alloc_iter,
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter lru_iter;
struct bch_alloc_v4 a_convert;
const struct bch_alloc_v4 *a;
- struct bkey_s_c alloc_k, lru_k;
+ struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret;
@@ -1573,11 +1621,19 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
+ if (a->fragmentation_lru) {
+ ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ a->fragmentation_lru,
+ alloc_k, last_flushed);
+ if (ret)
+ return ret;
+ }
+
if (a->data_type != BCH_DATA_cached)
return 0;
- if (fsck_err_on(!a->io_time[READ], c,
- alloc_key_cached_but_read_time_zero,
+ if (fsck_err_on(!a->io_time[READ],
+ trans, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
@@ -1597,73 +1653,66 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = &a_mut->v;
}
- lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]), 0);
- ret = bkey_err(lru_k);
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ alloc_k, last_flushed);
if (ret)
- return ret;
-
- if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
- alloc_key_to_missing_lru_entry,
- "missing lru entry\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- ret = bch2_lru_set(trans,
- alloc_k.k->p.inode,
- bucket_to_u64(alloc_k.k->p),
- a->io_time[READ]);
- if (ret)
- goto err;
- }
+ goto err;
err:
fsck_err:
- bch2_trans_iter_exit(trans, &lru_iter);
printbuf_exit(&buf);
return ret;
}
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter)));
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
return ret;
}
-static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
int ret;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
ret = -BCH_ERR_EEXIST_discard_in_flight_add;
goto out;
}
- ret = darray_push(&c->discard_buckets_in_flight, bucket);
+ ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
+ .in_progress = in_progress,
+ .bucket = bucket,
+ }));
out:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
return ret;
}
-static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i)
- if (bkey_eq(*i, bucket)) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i)
+ if (i->bucket == bucket) {
+ BUG_ON(!i->in_progress);
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
goto found;
}
BUG();
found:
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
struct discard_buckets_state {
@@ -1671,26 +1720,11 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- struct bch_dev *ca;
u64 need_journal_commit_this_dev;
};
-static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
-{
- if (s->ca == ca)
- return;
-
- if (s->ca && s->need_journal_commit_this_dev >
- bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
- bch2_journal_flush_async(&c->journal, NULL);
-
- if (s->ca)
- percpu_ref_put(&s->ca->io_ref);
- s->ca = ca;
- s->need_journal_commit_this_dev = 0;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
struct discard_buckets_state *s)
@@ -1704,16 +1738,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
bool discard_locked = false;
int ret = 0;
- struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
- ? s->ca
- : bch2_dev_get_ioref(c, pos.inode, WRITE);
- if (!ca) {
- bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
- return 0;
- }
-
- discard_buckets_next_dev(c, s, ca);
-
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
s->open++;
goto out;
@@ -1773,7 +1797,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+ if (discard_in_flight_add(ca, iter.pos.offset, true))
goto out;
discard_locked = true;
@@ -1811,7 +1835,7 @@ write:
s->discarded++;
out:
if (discard_locked)
- discard_in_flight_remove(c, iter.pos);
+ discard_in_flight_remove(ca, iter.pos.offset);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -1820,7 +1844,8 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
+ struct bch_fs *c = ca->fs;
struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1831,23 +1856,41 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
-
- discard_buckets_next_dev(c, &s, NULL);
+ for_each_btree_key_upto(trans, iter,
+ BTREE_ID_need_discard,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx, U64_MAX), 0, k,
+ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_discards(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_discards(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
- !queue_work(c->write_ref_wq, &c->discard_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ for_each_member_device(c, ca)
+ bch2_dev_do_discards(ca);
}
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
@@ -1876,68 +1919,69 @@ err:
static void bch2_do_discards_fast_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
+ struct bch_fs *c = ca->fs;
while (1) {
bool got_bucket = false;
- struct bpos bucket;
- struct bch_dev *ca;
+ u64 bucket;
- mutex_lock(&c->discard_buckets_in_flight_lock);
- darray_for_each(c->discard_buckets_in_flight, i) {
- if (i->snapshot)
+ mutex_lock(&ca->discard_buckets_in_flight_lock);
+ darray_for_each(ca->discard_buckets_in_flight, i) {
+ if (i->in_progress)
continue;
- ca = bch2_dev_get_ioref(c, i->inode, WRITE);
- if (!ca) {
- darray_remove_item(&c->discard_buckets_in_flight, i);
- continue;
- }
-
got_bucket = true;
- bucket = *i;
- i->snapshot = true;
+ bucket = i->bucket;
+ i->in_progress = true;
break;
}
- mutex_unlock(&c->discard_buckets_in_flight_lock);
+ mutex_unlock(&ca->discard_buckets_in_flight_lock);
if (!got_bucket)
break;
if (ca->mi.discard && !c->opts.nochanges)
blkdev_issue_discard(ca->disk_sb.bdev,
- bucket.offset * ca->mi.bucket_size,
+ bucket_to_sector(ca, bucket),
ca->mi.bucket_size,
GFP_KERNEL);
int ret = bch2_trans_do(c, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_bucket_needs_discard(trans, bucket));
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
bch_err_fn(c, ret);
- percpu_ref_put(&ca->io_ref);
- discard_in_flight_remove(c, bucket);
+ discard_in_flight_remove(ca, bucket);
if (ret)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ percpu_ref_put(&ca->io_ref);
}
-static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
{
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
- bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
- rcu_read_unlock();
+ struct bch_fs *c = ca->fs;
+
+ if (discard_in_flight_add(ca, bucket, false))
+ return;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
- if (!dead &&
- !discard_in_flight_add(c, bucket) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
- !queue_work(c->write_ref_wq, &c->discard_fast_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@@ -1963,7 +2007,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, bucket);
+ a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
@@ -1984,6 +2028,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
a->v.gen++;
a->v.data_type = 0;
a->v.dirty_sectors = 0;
+ a->v.stripe_sectors = 0;
a->v.cached_sectors = 0;
a->v.io_time[READ] = bch2_current_io_time(c, READ);
a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
@@ -2038,7 +2083,8 @@ again:
static void bch2_do_invalidates_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+ struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
+ struct bch_fs *c = ca->fs;
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
@@ -2046,52 +2092,63 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (ret)
goto err;
- for_each_member_device(c, ca) {
- s64 nr_to_invalidate =
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- struct btree_iter iter;
- bool wrapped = false;
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
- lru_pos(ca->dev_idx, 0,
- ((bch2_current_io_time(c, READ) + U32_MAX) &
- LRU_TIME_MAX)), 0);
-
- while (true) {
- bch2_trans_begin(trans);
-
- struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- break;
- if (!k.k)
- break;
+ s64 nr_to_invalidate =
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
+ struct btree_iter iter;
+ bool wrapped = false;
- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
- if (ret)
- break;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+ lru_pos(ca->dev_idx, 0,
+ ((bch2_current_io_time(c, READ) + U32_MAX) &
+ LRU_TIME_MAX)), 0);
- bch2_btree_iter_advance(&iter);
- }
- bch2_trans_iter_exit(trans, &iter);
+ while (true) {
+ bch2_trans_begin(trans);
- if (ret < 0) {
- bch2_dev_put(ca);
+ struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
break;
- }
+ if (!k.k)
+ break;
+
+ ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ if (ret)
+ break;
+
+ bch2_btree_iter_advance(&iter);
}
+ bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ percpu_ref_put(&ca->io_ref);
+}
+
+void bch2_dev_do_invalidates(struct bch_dev *ca)
+{
+ struct bch_fs *c = ca->fs;
+
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ return;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ goto put_ioref;
+
+ if (queue_work(c->write_ref_wq, &ca->invalidate_work))
+ return;
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+put_ioref:
+ percpu_ref_put(&ca->io_ref);
}
void bch2_do_invalidates(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
- !queue_work(c->write_ref_wq, &c->invalidate_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ for_each_member_device(c, ca)
+ bch2_dev_do_invalidates(ca);
}
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
@@ -2327,6 +2384,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
reserved_sectors = min(reserved_sectors, capacity);
+ c->reserved = reserved_sectors;
c->capacity = capacity - reserved_sectors;
c->bucket_size_max = bucket_size_max;
@@ -2407,16 +2465,20 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
-void bch2_fs_allocator_background_exit(struct bch_fs *c)
+void bch2_dev_allocator_background_exit(struct bch_dev *ca)
+{
+ darray_exit(&ca->discard_buckets_in_flight);
+}
+
+void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
- darray_exit(&c->discard_buckets_in_flight);
+ mutex_init(&ca->discard_buckets_in_flight_lock);
+ INIT_WORK(&ca->discard_work, bch2_do_discards_work);
+ INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
+ INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
- mutex_init(&c->discard_buckets_in_flight_lock);
- INIT_WORK(&c->discard_work, bch2_do_discards_work);
- INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
- INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index c3cc3c5ba5b6..8d2b62c9588e 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -41,6 +41,7 @@ static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src)
{
dst->gen = src.gen;
dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
dst->dirty_sectors = src.dirty_sectors;
dst->cached_sectors = src.cached_sectors;
dst->stripe = src.stripe;
@@ -50,6 +51,7 @@ static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket s
{
dst->gen = src.gen;
dst->data_type = src.data_type;
+ dst->stripe_sectors = src.stripe_sectors;
dst->dirty_sectors = src.dirty_sectors;
dst->cached_sectors = src.cached_sectors;
dst->stripe = src.stripe;
@@ -80,30 +82,49 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket,
bucket_data_type(bucket) != bucket_data_type(ptr);
}
-static inline unsigned bch2_bucket_sectors_total(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a)
{
- return a.dirty_sectors + a.cached_sectors;
+ return a.stripe_sectors + a.dirty_sectors + a.cached_sectors;
}
-static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
{
- return a.dirty_sectors;
+ return a.stripe_sectors + a.dirty_sectors;
}
-static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_cached
+ ? a.cached_sectors
+ : bch2_bucket_sectors_dirty(a);
+}
+
+static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca,
struct bch_alloc_v4 a)
{
- int d = bch2_bucket_sectors_dirty(a);
+ int d = bch2_bucket_sectors(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
+static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a)
+{
+ int d = a.stripe_sectors + a.dirty_sectors;
return d ? max(0, ca->mi.bucket_size - d) : 0;
}
+static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a)
+{
+ return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0;
+}
+
static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
enum bch_data_type data_type)
{
if (a.stripe)
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
- if (a.dirty_sectors)
+ if (bch2_bucket_sectors_dirty(a))
return data_type;
if (a.cached_sectors)
return BCH_DATA_cached;
@@ -185,7 +206,8 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a)
struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos);
struct bkey_i_alloc_v4 *
-bch2_trans_start_alloc_update(struct btree_trans *, struct bpos);
+bch2_trans_start_alloc_update(struct btree_trans *, struct bpos,
+ enum btree_iter_update_trigger_flags);
void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *);
@@ -270,11 +292,15 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
+int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, unsigned);
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
+void bch2_dev_do_discards(struct bch_dev *);
void bch2_do_discards(struct bch_fs *);
static inline u64 should_invalidate_buckets(struct bch_dev *ca,
@@ -289,6 +315,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
+void bch2_dev_do_invalidates(struct bch_dev *);
void bch2_do_invalidates(struct bch_fs *);
static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
@@ -312,7 +339,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
-void bch2_fs_allocator_background_exit(struct bch_fs *);
+void bch2_dev_allocator_background_exit(struct bch_dev *);
+void bch2_dev_allocator_background_init(struct bch_dev *);
+
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
index b4ec20be93b8..47d9d006502c 100644
--- a/fs/bcachefs/alloc_background_format.h
+++ b/fs/bcachefs/alloc_background_format.h
@@ -70,6 +70,8 @@ struct bch_alloc_v4 {
__u32 stripe;
__u32 nr_external_backpointers;
__u64 fragmentation_lru;
+ __u32 stripe_sectors;
+ __u32 pad;
} __packed __aligned(8);
#define BCH_ALLOC_V4_U64s_V0 6
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 927a5f300b30..cabf866c7956 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -621,13 +621,13 @@ again:
avail = dev_buckets_free(ca, *usage, watermark);
if (usage->d[BCH_DATA_need_discard].buckets > avail)
- bch2_do_discards(c);
+ bch2_dev_do_discards(ca);
if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
bch2_gc_gens_async(c);
if (should_invalidate_buckets(ca, *usage))
- bch2_do_invalidates(c);
+ bch2_dev_do_invalidates(ca);
if (!avail) {
if (cl && !waiting) {
@@ -1589,7 +1589,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
}
}
-static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
+void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = ob_dev(c, ob);
unsigned data_type = ob->data_type;
@@ -1703,17 +1703,18 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
+ printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 24);
- percpu_down_read(&c->mark_lock);
- prt_printf(out, "hidden\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.hidden));
- prt_printf(out, "btree\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.btree));
- prt_printf(out, "data\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.data));
- prt_printf(out, "cached\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.cached));
- prt_printf(out, "reserved\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.reserved));
- prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
- prt_printf(out, "nr_inodes\t%llu\n", bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes));
- percpu_up_read(&c->mark_lock);
+ prt_printf(out, "capacity\t%llu\n", c->capacity);
+ prt_printf(out, "reserved\t%llu\n", c->reserved);
+ prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
+ prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
+ prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
+ prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
+ prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
+ prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
+ prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));
prt_newline(out);
prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
@@ -1736,6 +1737,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
nr[c->open_buckets[i].data_type]++;
+ printbuf_tabstops_reset(out);
printbuf_tabstop_push(out, 12);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index a42c9730d32a..6da9e7e29026 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -222,6 +222,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
void bch2_fs_allocator_foreground_init(struct bch_fs *);
+void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *);
void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 4321f9fb73bd..3cc02479a982 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -395,7 +395,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
struct bpos bucket;
if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) {
- if (fsck_err(c, backpointer_to_missing_device,
+ if (fsck_err(trans, backpointer_to_missing_device,
"backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, bp_iter, 0);
@@ -407,8 +407,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
if (ret)
goto out;
- if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
- backpointer_to_missing_alloc,
+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4,
+ trans, backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -434,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
return ret;
}
-static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
-{
- return bpos_eq(l.k->p, r.k->p) &&
- bkey_bytes(l.k) == bkey_bytes(r.k) &&
- !memcmp(l.v, r.v, bkey_val_bytes(l.k));
-}
-
struct extents_to_bp_state {
struct bpos bucket_start;
struct bpos bucket_end;
@@ -512,7 +505,7 @@ found:
struct nonce nonce = extent_nonce(extent.k->version, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
- c, dup_backpointer_to_bad_csum_extent,
+ trans, dup_backpointer_to_bad_csum_extent,
"%s", buf.buf))
ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
fsck_err:
@@ -536,11 +529,8 @@ static int check_bp_exists(struct btree_trans *trans,
struct btree_iter other_extent_iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
- struct bkey_buf tmp;
int ret = 0;
- bch2_bkey_buf_init(&tmp);
-
struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket);
if (!ca) {
prt_str(&buf, "extent for nonexistent device:bucket ");
@@ -565,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- bch2_bkey_buf_reassemble(&tmp, c, orig_k);
-
- if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
- if (bp.level) {
- bch2_trans_unlock(trans);
- bch2_btree_interior_updates_flush(c);
- }
-
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
- bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed);
+ if (ret)
+ goto err;
goto check_existing_bp;
}
@@ -589,7 +566,6 @@ err:
fsck_err:
bch2_trans_iter_exit(trans, &other_extent_iter);
bch2_trans_iter_exit(trans, &bp_iter);
- bch2_bkey_buf_exit(&tmp, c);
bch2_dev_put(ca);
printbuf_exit(&buf);
return ret;
@@ -671,7 +647,7 @@ missing:
prt_printf(&buf, "\n want: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
- if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+ if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true);
goto out;
@@ -786,14 +762,16 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
for (enum btree_id btree = start.btree;
btree < BTREE_ID_NR && !ret;
btree++) {
- unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+ unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1;
struct btree_iter iter;
struct btree *b;
- if (!((1U << btree) & btree_leaf_mask) &&
- !((1U << btree) & btree_interior_mask))
+ if (!(BIT_ULL(btree) & btree_leaf_mask) &&
+ !(BIT_ULL(btree) & btree_interior_mask))
continue;
+ bch2_trans_begin(trans);
+
__for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
0, depth, BTREE_ITER_prefetch, b, ret) {
@@ -905,7 +883,7 @@ static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
struct bkey_s_c_backpointer bp,
- struct bpos *last_flushed_pos)
+ struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -925,20 +903,18 @@ static int check_one_backpointer(struct btree_trans *trans,
if (ret)
return ret;
- if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) {
- *last_flushed_pos = bp.k->p;
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
- goto out;
- }
+ if (!k.k) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed);
+ if (ret)
+ goto out;
- if (fsck_err_on(!k.k, c,
- backpointer_to_missing_ptr,
- "backpointer for missing %s\n %s",
- bp.v->level ? "btree node" : "extent",
- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
- goto out;
+ if (fsck_err(trans, backpointer_to_missing_ptr,
+ "backpointer for missing %s\n %s",
+ bp.v->level ? "btree node" : "extent",
+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
+ ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
+ goto out;
+ }
}
out:
fsck_err:
@@ -951,14 +927,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct bpos last_flushed_pos = SPOS_MAX;
+ struct bkey_buf last_flushed;
- return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
- &last_flushed_pos));
+ &last_flushed));
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ return ret;
}
int bch2_check_backpointers_to_extents(struct bch_fs *c)
@@ -969,8 +951,8 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
while (1) {
ret = bch2_get_btree_in_memory_pos(trans,
- (1U << BTREE_ID_extents)|
- (1U << BTREE_ID_reflink),
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink),
~0,
start, &end);
if (ret)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index a6b83ecab7ce..91361a167dcd 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -205,6 +205,7 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
+#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
#include "nocow_locking_types.h"
@@ -266,6 +267,8 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+void bch2_print_str(struct bch_fs *, const char *);
+
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
@@ -493,6 +496,11 @@ struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
+struct discard_in_flight {
+ bool in_progress:1;
+ u64 bucket:63;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -530,8 +538,8 @@ struct bch_dev {
/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
- * gc_lock, for device resize - holding any is sufficient for access:
- * Or rcu_read_lock(), but only for dev_ptr_stale():
+ * gc_gens_lock, for device resize - holding any is sufficient for
+ * access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
struct bucket_gens __rcu *bucket_gens;
@@ -539,9 +547,7 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
- struct bch_dev_usage *usage_base;
- struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
- struct bch_dev_usage __percpu *usage_gc;
+ struct bch_dev_usage __percpu *usage;
/* Allocator: */
u64 new_fs_bucket_idx;
@@ -554,6 +560,12 @@ struct bch_dev {
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
+ struct work_struct invalidate_work;
+ struct work_struct discard_work;
+ struct mutex discard_buckets_in_flight_lock;
+ DARRAY(struct discard_in_flight) discard_buckets_in_flight;
+ struct work_struct discard_fast_work;
+
atomic64_t rebalance_work;
struct journal_device journal;
@@ -581,6 +593,8 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \
+ x(btree_running) \
+ x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
x(was_rw) \
@@ -659,8 +673,6 @@ struct btree_trans_buf {
struct btree_trans *trans;
};
-#define REPLICAS_DELTA_LIST_MAX (1U << 16)
-
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
@@ -730,15 +742,14 @@ struct bch_fs {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+ struct bch_accounting_mem accounting;
+
struct bch_replicas_cpu replicas;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
- mempool_t replicas_delta_pool;
struct journal_entry_res btree_root_journal_res;
- struct journal_entry_res replicas_journal_res;
struct journal_entry_res clock_journal_res;
- struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
@@ -861,6 +872,7 @@ struct bch_fs {
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
+ u64 reserved; /* sectors */
/*
* When capacity _decreases_ (due to a disk being removed), we
@@ -878,15 +890,9 @@ struct bch_fs {
struct percpu_rw_semaphore mark_lock;
seqcount_t usage_lock;
- struct bch_fs_usage *usage_base;
- struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
- struct bch_fs_usage __percpu *usage_gc;
+ struct bch_fs_usage_base __percpu *usage;
u64 __percpu *online_reserved;
- /* single element mempool: */
- struct mutex usage_scratch_lock;
- struct bch_fs_usage_online *usage_scratch;
-
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
@@ -915,11 +921,6 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
- struct work_struct invalidate_work;
- struct work_struct discard_work;
- struct mutex discard_buckets_in_flight_lock;
- DARRAY(struct bpos) discard_buckets_in_flight;
- struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct work_struct gc_gens_work;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index e3b1bde489c3..74a60b1a4ddf 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k)
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
- x(logged_op_finsert, 33)
+ x(logged_op_finsert, 33) \
+ x(accounting, 34)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -467,18 +468,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-/* LRU btree: */
-
-struct bch_lru {
- struct bch_val v;
- __le64 idx;
-} __packed __aligned(8);
-
-#define LRU_ID_STRIPES (1U << 16)
-
-#define LRU_TIME_BITS 48
-#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -505,6 +494,9 @@ struct bch_sb_field {
x(downgrade, 14)
#include "alloc_background_format.h"
+#include "dirent_format.h"
+#include "disk_accounting_format.h"
+#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "dirent_format.h"
@@ -512,6 +504,7 @@ struct bch_sb_field {
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
+#include "lru_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
@@ -602,48 +595,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
-#define BCH_DATA_TYPES() \
- x(free, 0) \
- x(sb, 1) \
- x(journal, 2) \
- x(btree, 3) \
- x(user, 4) \
- x(cached, 5) \
- x(parity, 6) \
- x(stripe, 7) \
- x(need_gc_gens, 8) \
- x(need_discard, 9)
-
-enum bch_data_type {
-#define x(t, n) BCH_DATA_##t,
- BCH_DATA_TYPES()
-#undef x
- BCH_DATA_NR
-};
-
-static inline bool data_type_is_empty(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_free:
- case BCH_DATA_need_gc_gens:
- case BCH_DATA_need_discard:
- return true;
- default:
- return false;
- }
-}
-
-static inline bool data_type_is_hidden(enum bch_data_type type)
-{
- switch (type) {
- case BCH_DATA_sb:
- case BCH_DATA_journal:
- return true;
- default:
- return false;
- }
-}
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -722,7 +673,9 @@ struct bch_sb_field_ext {
x(member_seq, BCH_VERSION(1, 4)) \
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
x(btree_subvolume_children, BCH_VERSION(1, 6)) \
- x(mi_btree_bitmap, BCH_VERSION(1, 7))
+ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
+ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
+ x(disk_accounting_v2, BCH_VERSION(1, 9))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1174,7 +1127,6 @@ static inline bool jset_entry_is_key(struct jset_entry *e)
switch (e->type) {
case BCH_JSET_ENTRY_btree_keys:
case BCH_JSET_ENTRY_btree_root:
- case BCH_JSET_ENTRY_overwrite:
case BCH_JSET_ENTRY_write_buffer_keys:
return true;
}
@@ -1375,7 +1327,9 @@ enum btree_id_flags {
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
- BIT_ULL(KEY_TYPE_set))
+ BIT_ULL(KEY_TYPE_set)) \
+ x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
+ BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 4b8fba754b1c..3c23bdf788ce 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -5,6 +5,7 @@
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
+#include "bkey_types.h"
/*
* Flags common to multiple ioctls:
@@ -85,6 +86,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -251,12 +253,18 @@ struct bch_replicas_usage {
struct bch_replicas_entry_v1 r;
} __packed;
+static inline unsigned replicas_usage_bytes(struct bch_replicas_usage *u)
+{
+ return offsetof(struct bch_replicas_usage, r) + replicas_entry_bytes(&u->r);
+}
+
static inline struct bch_replicas_usage *
replicas_usage_next(struct bch_replicas_usage *u)
{
- return (void *) u + replicas_entry_bytes(&u->r) + 8;
+ return (void *) u + replicas_usage_bytes(u);
}
+/* Obsolete */
/*
* BCH_IOCTL_FS_USAGE: query filesystem disk space usage
*
@@ -282,6 +290,7 @@ struct bch_ioctl_fs_usage {
struct bch_replicas_usage replicas[];
};
+/* Obsolete */
/*
* BCH_IOCTL_DEV_USAGE: query device disk space usage
*
@@ -306,6 +315,7 @@ struct bch_ioctl_dev_usage {
} d[10];
};
+/* Obsolete */
struct bch_ioctl_dev_usage_v2 {
__u64 dev;
__u32 flags;
@@ -409,4 +419,28 @@ struct bch_ioctl_fsck_online {
__u64 opts; /* string */
};
+/*
+ * BCH_IOCTL_QUERY_ACCOUNTING: query filesystem disk accounting
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
+ *
+ * On success, @replica_entries_bytes will be changed to indicate the number of
+ * bytes actually used.
+ *
+ * Returns -ERANGE if @replica_entries_bytes was too small
+ */
+struct bch_ioctl_query_accounting {
+ __u64 capacity;
+ __u64 used;
+ __u64 online_reserved;
+
+ __u32 accounting_u64s; /* input parameter */
+ __u32 accounting_types_mask; /* input parameter */
+
+ struct bkey_i_accounting accounting[];
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 94a1d1982fa8..587d7318a2e8 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -660,8 +660,9 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
- u64 packed_max = f->bits_per_field[i]
- ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+ unsigned packed_bits = min(64, f->bits_per_field[i]);
+ u64 packed_max = packed_bits
+ ? ~((~0ULL << 1) << (packed_bits - 1))
: 0;
prt_printf(err, "field %u too large: %llu + %llu > %llu",
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index fcd43915df07..936357149cf0 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
return bkey_gt(l, r) ? l : r;
}
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index bd32aac05192..5f07cf853d0c 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -7,6 +7,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 4f5e411771ba..f5d85b50b6f2 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -602,8 +602,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current)
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
goto success;
if (!cl) {
@@ -614,8 +614,8 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
- old = cmpxchg(&bc->alloc_lock, NULL, current);
- if (old == NULL || old == current) {
+ old = NULL;
+ if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
/* We raced */
closure_wake_up(&bc->alloc_wait);
goto success;
@@ -1257,6 +1257,14 @@ const char *bch2_btree_id_str(enum btree_id btree)
return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
}
+void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
+{
+ if (btree < BTREE_ID_NR)
+ prt_str(out, __bch2_btree_ids[btree]);
+ else
+ prt_printf(out, "(unknown btree %u)", btree);
+}
+
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
prt_printf(out, "%s level %u/%u\n ",
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index fed35de3e4de..c0eb87a057cc 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -132,6 +132,8 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
}
const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
+
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 0e477a926579..6cbf2aa6a947 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -20,6 +20,7 @@
#include "buckets.h"
#include "clock.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -44,6 +45,22 @@
#define DROP_PREV_NODE 11
#define DID_FILL_FROM_SCAN 12
+static const char * const bch2_gc_phase_strs[] = {
+#define x(n) #n,
+ GC_PHASES()
+#undef x
+ NULL
+};
+
+void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
+{
+ prt_str(out, bch2_gc_phase_strs[p->phase]);
+ prt_char(out, ' ');
+ bch2_btree_id_to_text(out, p->btree);
+ prt_printf(out, " l=%u ", p->level);
+ bch2_bpos_to_text(out, p->pos);
+}
+
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
return (struct bkey_s) {{{
@@ -174,10 +191,11 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
return 0;
}
-static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *b,
struct btree *prev, struct btree *cur,
struct bpos *pulled_from_scan)
{
+ struct bch_fs *c = trans->c;
struct bpos expected_start = !prev
? b->data->min_key
: bpos_successor(prev->key.k.p);
@@ -215,29 +233,29 @@ static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
*pulled_from_scan = cur->data->min_key;
ret = DID_FILL_FROM_SCAN;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
} else { /* overlap */
if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
- if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node%s", buf.buf))
ret = DROP_PREV_NODE;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
}
} else {
if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
- if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+ if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node%s", buf.buf))
ret = DROP_THIS_NODE;
} else {
- if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key,
"btree node with incorrect min_key%s", buf.buf))
ret = set_node_min(c, cur, expected_start);
}
@@ -249,9 +267,10 @@ fsck_err:
return ret;
}
-static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
struct btree *child, struct bpos *pulled_from_scan)
{
+ struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -265,7 +284,7 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
prt_str(&buf, "\n child: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
- if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+ if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
"btree node with incorrect max_key%s", buf.buf)) {
if (b->c.level == 1 &&
bpos_lt(*pulled_from_scan, b->key.k.p)) {
@@ -324,8 +343,8 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
- btree_node_unreadable,
+ if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
+ trans, btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_id_str(b->c.btree_id),
@@ -362,7 +381,7 @@ again:
continue;
}
- ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+ ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -403,7 +422,7 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+ ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -461,8 +480,8 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- if (mustfix_fsck_err_on(!have_child, c,
- btree_node_topology_interior_node_empty,
+ if (mustfix_fsck_err_on(!have_child,
+ trans, btree_node_topology_interior_node_empty,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_id_str(b->c.btree_id),
@@ -509,7 +528,7 @@ reconstruct_root:
r->error = 0;
if (!bch2_btree_has_scanned_nodes(c, i)) {
- mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+ mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
bch2_btree_root_alloc_fake_trans(trans, i, 0);
} else {
@@ -583,8 +602,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > atomic64_read(&c->journal.seq));
- if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
- bkey_version_in_future,
+ if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+ k.k->version.lo > atomic64_read(&c->key_version),
+ trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -592,7 +612,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
- c, btree_bitmap_not_marked,
+ trans, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
@@ -622,7 +642,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
- BTREE_TRIGGER_gc|flags);
+ BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
out:
fsck_err:
printbuf_exit(&buf);
@@ -633,29 +653,14 @@ fsck_err:
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
- int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
+ unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
int ret = 0;
/* We need to make sure every leaf node is readable before going RW */
if (initial)
target_depth = 0;
- /* root */
- mutex_lock(&c->btree_root_lock);
- struct btree *b = bch2_btree_id_root(c, btree)->b;
- if (!btree_node_fake(b)) {
- gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
- ret = lockrestart_do(trans,
- bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
- NULL, NULL, bkey_i_to_s_c(&b->key), initial));
- level = b->c.level;
- }
- mutex_unlock(&c->btree_root_lock);
-
- if (ret)
- return ret;
-
- for (; level >= target_depth; --level) {
+ for (unsigned level = target_depth; level < BTREE_MAX_DEPTH; level++) {
struct btree *prev = NULL;
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
@@ -666,9 +671,35 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
}));
if (ret)
- break;
+ goto err;
}
+ /* root */
+ do {
+retry_root:
+ bch2_trans_begin(trans);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+ 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err_root;
+
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
+ gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
+err_root:
+ bch2_trans_iter_exit(trans, &iter);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+err:
+ bch_err_fn(c, ret);
return ret;
}
@@ -697,7 +728,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
ret = bch2_gc_btree(trans, btree, true);
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- c, btree_node_read_error,
+ trans, btree_node_read_error,
"btree node read error for %s",
bch2_btree_id_str(btree)))
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
@@ -720,131 +751,25 @@ static int bch2_mark_superblocks(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
+ bch2_accounting_gc_free(c);
+
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
-
- free_percpu(ca->usage_gc);
- ca->usage_gc = NULL;
- }
-
- free_percpu(c->usage_gc);
- c->usage_gc = NULL;
-}
-
-static int bch2_gc_done(struct bch_fs *c)
-{
- struct bch_dev *ca = NULL;
- struct printbuf buf = PRINTBUF;
- unsigned i;
- int ret = 0;
-
- percpu_down_write(&c->mark_lock);
-
-#define copy_field(_err, _f, _msg, ...) \
- if (fsck_err_on(dst->_f != src->_f, c, _err, \
- _msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
- dst->_f, src->_f)) \
- dst->_f = src->_f
-#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
-#define copy_fs_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
-
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
-
- __for_each_member_device(c, ca) {
- struct bch_dev_usage *dst = ca->usage_base;
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
- dev_usage_u64s());
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_type_str(i));
- copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_type_str(i));
- copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
- }
- }
-
- {
- unsigned nr = fs_usage_u64s(c);
- struct bch_fs_usage *dst = c->usage_base;
- struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
-
- copy_fs_field(fs_usage_hidden_wrong,
- b.hidden, "hidden");
- copy_fs_field(fs_usage_btree_wrong,
- b.btree, "btree");
-
- copy_fs_field(fs_usage_data_wrong,
- b.data, "data");
- copy_fs_field(fs_usage_cached_wrong,
- b.cached, "cached");
- copy_fs_field(fs_usage_reserved_wrong,
- b.reserved, "reserved");
- copy_fs_field(fs_usage_nr_inodes_wrong,
- b.nr_inodes,"nr_inodes");
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- copy_fs_field(fs_usage_persistent_reserved_wrong,
- persistent_reserved[i],
- "persistent_reserved[%i]", i);
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- printbuf_reset(&buf);
- bch2_replicas_entry_to_text(&buf, e);
-
- copy_fs_field(fs_usage_replicas_wrong,
- replicas[i], "%s", buf.buf);
- }
}
-
-#undef copy_fs_field
-#undef copy_dev_field
-#undef copy_stripe_field
-#undef copy_field
-fsck_err:
- bch2_dev_put(ca);
- bch_err_fn(c, ret);
- percpu_up_write(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
}
static int bch2_gc_start(struct bch_fs *c)
{
- BUG_ON(c->usage_gc);
-
- c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
- sizeof(u64), GFP_KERNEL);
- if (!c->usage_gc) {
- bch_err(c, "error allocating c->usage_gc");
- return -BCH_ERR_ENOMEM_gc_start;
- }
-
for_each_member_device(c, ca) {
- BUG_ON(ca->usage_gc);
-
- ca->usage_gc = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage_gc) {
- bch_err(c, "error allocating ca->usage_gc");
+ int ret = bch2_dev_usage_init(ca, true);
+ if (ret) {
bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_gc_start;
+ return ret;
}
-
- this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
- ca->mi.nbuckets - ca->mi.first_bucket);
}
return 0;
@@ -858,6 +783,7 @@ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type ||
l.dirty_sectors != r.dirty_sectors ||
+ l.stripe_sectors != r.stripe_sectors ||
l.cached_sectors != r.cached_sectors ||
l.stripe_redundancy != r.stripe_redundancy ||
l.stripe != r.stripe;
@@ -891,6 +817,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
gc.data_type = old->data_type;
gc.dirty_sectors = old->dirty_sectors;
}
+ percpu_up_read(&c->mark_lock);
/*
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
@@ -899,12 +826,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
alloc_data_type_set(&gc, gc.data_type);
if (gc.data_type != old_gc.data_type ||
- gc.dirty_sectors != old_gc.dirty_sectors)
- bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
- percpu_up_read(&c->mark_lock);
+ gc.dirty_sectors != old_gc.dirty_sectors) {
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
+
+ gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca);
- if (fsck_err_on(new.data_type != gc.data_type, c,
- alloc_key_data_type_wrong,
+ if (fsck_err_on(new.data_type != gc.data_type,
+ trans, alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
@@ -914,25 +845,23 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
- if (fsck_err_on(new._f != gc._f, c, _errtype, \
+ if (fsck_err_on(new._f != gc._f, \
+ trans, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
- ": got %u, should be %u", \
+ ": got %llu, should be %llu", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
bch2_data_type_str(gc.data_type), \
- new._f, gc._f)) \
+ (u64) new._f, (u64) gc._f)) \
new._f = gc._f; \
- copy_bucket_field(alloc_key_gen_wrong,
- gen);
- copy_bucket_field(alloc_key_dirty_sectors_wrong,
- dirty_sectors);
- copy_bucket_field(alloc_key_cached_sectors_wrong,
- cached_sectors);
- copy_bucket_field(alloc_key_stripe_wrong,
- stripe);
- copy_bucket_field(alloc_key_stripe_redundancy_wrong,
- stripe_redundancy);
+ copy_bucket_field(alloc_key_gen_wrong, gen);
+ copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors);
+ copy_bucket_field(alloc_key_stripe_sectors_wrong, stripe_sectors);
+ copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors);
+ copy_bucket_field(alloc_key_stripe_wrong, stripe);
+ copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy);
+ copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@@ -946,7 +875,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
a->v = new;
/*
- * The trigger normally makes sure this is set, but we're not running
+ * The trigger normally makes sure these are set, but we're not running
* triggers:
*/
if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
@@ -981,14 +910,16 @@ static int bch2_gc_alloc_done(struct bch_fs *c)
static int bch2_gc_alloc_start(struct bch_fs *c)
{
+ int ret = 0;
+
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
bch2_dev_put(ca);
- bch_err(c, "error allocating ca->buckets[gc]");
- return -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ break;
}
buckets->first_bucket = ca->mi.first_bucket;
@@ -998,27 +929,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
- struct bch_dev *ca = NULL;
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- ca = bch2_dev_iterate(c, ca, k.k->p.inode);
- if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
- continue;
- }
-
- if (bucket_valid(ca, k.k->p.offset)) {
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
-
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
- g->gen_valid = 1;
- g->gen = a->gen;
- }
- 0;
- })));
- bch2_dev_put(ca);
bch_err_fn(c, ret);
return ret;
}
@@ -1048,8 +958,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
return -EINVAL;
}
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- reflink_v_refcount_wrong,
+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
+ trans, reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
@@ -1147,7 +1057,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
- if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+ if (fsck_err_on(bad,
+ trans, stripe_sector_count_wrong,
"%s", buf.buf)) {
struct bkey_i_stripe *new;
@@ -1210,7 +1121,8 @@ int bch2_check_allocations(struct bch_fs *c)
bch2_btree_interior_updates_flush(c);
- ret = bch2_gc_start(c) ?:
+ ret = bch2_gc_accounting_start(c) ?:
+ bch2_gc_start(c) ?:
bch2_gc_alloc_start(c) ?:
bch2_gc_reflink_start(c);
if (ret)
@@ -1219,7 +1131,9 @@ int bch2_check_allocations(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_start));
ret = bch2_mark_superblocks(c);
- BUG_ON(ret);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto out;
ret = bch2_gc_btrees(c);
if (ret)
@@ -1227,15 +1141,11 @@ int bch2_check_allocations(struct bch_fs *c)
c->gc_count++;
- bch2_journal_block(&c->journal);
-out:
ret = bch2_gc_alloc_done(c) ?:
- bch2_gc_done(c) ?:
+ bch2_gc_accounting_done(c) ?:
bch2_gc_stripes_done(c) ?:
bch2_gc_reflink_done(c);
-
- bch2_journal_unblock(&c->journal);
-
+out:
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
@@ -1330,7 +1240,7 @@ int bch2_gc_gens(struct bch_fs *c)
int ret;
/*
- * Ideally we would be using state_lock and not gc_lock here, but that
+ * Ideally we would be using state_lock and not gc_gens_lock here, but that
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
@@ -1338,7 +1248,8 @@ int bch2_gc_gens(struct bch_fs *c)
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->gc_lock);
+
+ down_read(&c->state_lock);
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
@@ -1407,7 +1318,7 @@ err:
ca->oldest_gen = NULL;
}
- up_read(&c->gc_lock);
+ up_read(&c->state_lock);
mutex_unlock(&c->gc_gens_lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 876d81e2017d..8a47e8bd0791 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -47,17 +47,10 @@ static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
};
}
-/*
- * GC position of the pointers within a btree node: note, _not_ for &b->key
- * itself, that lives in the parent node:
- */
-static inline struct gc_pos gc_pos_btree_node(struct btree *b)
-{
- return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
-}
-
static inline int gc_btree_order(enum btree_id btree)
{
+ if (btree == BTREE_ID_alloc)
+ return -2;
if (btree == BTREE_ID_stripes)
return -1;
return btree;
@@ -65,11 +58,11 @@ static inline int gc_btree_order(enum btree_id btree)
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
- return cmp_int(l.phase, r.phase) ?:
- cmp_int(gc_btree_order(l.btree),
- gc_btree_order(r.btree)) ?:
- -cmp_int(l.level, r.level) ?:
- bpos_cmp(l.pos, r.pos);
+ return cmp_int(l.phase, r.phase) ?:
+ cmp_int(gc_btree_order(l.btree),
+ gc_btree_order(r.btree)) ?:
+ cmp_int(l.level, r.level) ?:
+ bpos_cmp(l.pos, r.pos);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
@@ -85,6 +78,8 @@ static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
return ret;
}
+void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
+
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
void bch2_fs_gc_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h
index b82c24bcc088..c24dd6edf377 100644
--- a/fs/bcachefs/btree_gc_types.h
+++ b/fs/bcachefs/btree_gc_types.h
@@ -4,11 +4,16 @@
#include <linux/generic-radix-tree.h>
+#define GC_PHASES() \
+ x(not_running) \
+ x(start) \
+ x(sb) \
+ x(btree)
+
enum gc_phase {
- GC_PHASE_not_running,
- GC_PHASE_start,
- GC_PHASE_sb,
- GC_PHASE_btree,
+#define x(n) GC_PHASE_##n,
+ GC_PHASES()
+#undef x
};
struct gc_pos {
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 7bca15c604f5..2c424435ca4a 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -46,8 +46,6 @@ void bch2_btree_node_io_unlock(struct btree *b)
void bch2_btree_node_io_lock(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -66,16 +64,12 @@ void __bch2_btree_node_wait_on_write(struct btree *b)
void bch2_btree_node_wait_on_read(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
}
void bch2_btree_node_wait_on_write(struct btree *b)
{
- bch2_assert_btree_nodes_not_locked();
-
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
@@ -534,7 +528,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
printbuf_indent_add(out, 2);
prt_printf(out, "\nnode offset %u/%u",
- b->written, btree_ptr_sectors_written(&b->key));
+ b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
if (k)
@@ -585,7 +579,7 @@ static int __btree_err(int ret,
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret = !silent
- ? bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf)
+ ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
: -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
@@ -689,6 +683,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int write, bool have_retry, bool *saw_error)
{
unsigned version = le16_to_cpu(i->version);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
@@ -732,11 +727,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (btree_err_on(offset + sectors > btree_sectors(c),
+ if (!write &&
+ btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
- "bset past end of btree node")) {
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ offset, sectors, ptr_written ?: btree_sectors(c))) {
i->u64s = 0;
ret = 0;
goto out;
@@ -1002,7 +999,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
unsigned u64s;
- unsigned ptr_written = btree_ptr_sectors_written(&b->key);
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
u64 start_time = local_clock();
@@ -1178,6 +1176,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_add(iter,
vstruct_idx(i, 0),
vstruct_last(i));
+
+ max_journal_seq = max(max_journal_seq, le64_to_cpu(i->journal_seq));
}
if (ptr_written) {
@@ -1214,6 +1214,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
+ b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
BUG_ON(b->nr.live_u64s != u64s);
@@ -1796,15 +1797,16 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
- unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+ unsigned long old, new;
+ old = READ_ONCE(b->will_make_reachable);
do {
- old = new = v;
+ new = old;
if (!(old & 1))
break;
new &= ~1UL;
- } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+ } while (!try_cmpxchg(&b->will_make_reachable, &old, new));
if (old & 1)
closure_put(&((struct btree_update *) new)->cl);
@@ -1815,14 +1817,14 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
struct btree_write *w = btree_prev_write(b);
- unsigned long old, new, v;
+ unsigned long old, new;
unsigned type = 0;
bch2_btree_complete_write(c, b, w);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
if ((old & (1U << BTREE_NODE_dirty)) &&
(old & (1U << BTREE_NODE_need_write)) &&
@@ -1842,7 +1844,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
new &= ~(1U << BTREE_NODE_write_in_flight);
new &= ~(1U << BTREE_NODE_write_in_flight_inner);
}
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
@@ -2014,8 +2016,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
* dirty bit requires a write lock, we can't race with other threads
* redirtying it:
*/
+ old = READ_ONCE(b->flags);
do {
- old = new = READ_ONCE(b->flags);
+ new = old;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
@@ -2046,7 +2049,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
new |= (1 << BTREE_NODE_write_in_flight_inner);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
- } while (cmpxchg_acquire(&b->flags, old, new) != old);
+ } while (!try_cmpxchg_acquire(&b->flags, &old, new));
if (new & (1U << BTREE_NODE_need_write))
return;
@@ -2133,7 +2136,7 @@ do_write:
if (!b->written &&
b->key.k.type == KEY_TYPE_btree_ptr_v2)
- BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);
+ BUG_ON(btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)) != sectors_to_write);
memset(data + bytes_to_write, 0,
(sectors_to_write << 9) - bytes_to_write);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 2b8b564fc560..63d76f5c6403 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -27,10 +27,10 @@ static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b
atomic_dec(&c->btree_cache.dirty);
}
-static inline unsigned btree_ptr_sectors_written(struct bkey_i *k)
+static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k)
{
- return k->k.type == KEY_TYPE_btree_ptr_v2
- ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written)
+ return k.k->type == KEY_TYPE_btree_ptr_v2
+ ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written)
: 0;
}
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3a1419d17888..36872207f09b 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -325,7 +325,7 @@ out:
}
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos, bool key_cache)
+ struct bpos pos)
{
bch2_trans_verify_not_unlocked(trans);
@@ -336,19 +336,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
btree_trans_sort_paths(trans);
trans_for_each_path_inorder(trans, path, iter) {
- int cmp = cmp_int(path->btree_id, id) ?:
- cmp_int(path->cached, key_cache);
-
- if (cmp > 0)
- break;
- if (cmp < 0)
- continue;
-
- if (!btree_node_locked(path, 0) ||
+ if (path->btree_id != id ||
+ !btree_node_locked(path, 0) ||
!path->should_be_locked)
continue;
- if (!key_cache) {
+ if (!path->cached) {
if (bkey_ge(pos, path->l[0].b->data->min_key) &&
bkey_le(pos, path->l[0].b->key.k.p))
return;
@@ -361,9 +354,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
bch2_dump_trans_paths_updates(trans);
bch2_bpos_to_text(&buf, pos);
- panic("not locked: %s %s%s\n",
- bch2_btree_id_str(id), buf.buf,
- key_cache ? " cached" : "");
+ panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
#else
@@ -996,7 +987,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
- trans->locked = true;
+ trans_set_locked(trans);
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1465,7 +1456,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
struct printbuf buf = PRINTBUF;
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}
@@ -1482,6 +1473,14 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
path->level);
bch2_bpos_to_text(out, path->pos);
+ if (!path->cached && btree_node_locked(path, path->level)) {
+ prt_char(out, ' ');
+ struct btree *b = path_l(path)->b;
+ bch2_bpos_to_text(out, b->data->min_key);
+ prt_char(out, '-');
+ bch2_bpos_to_text(out, b->key.k.p);
+ }
+
#ifdef TRACK_PATH_ALLOCATED
prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
@@ -1557,7 +1556,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(trans->c, buf.buf);
printbuf_exit(&buf);
}
@@ -1801,13 +1800,12 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
goto hole;
} else {
struct bkey_cached *ck = (void *) path->l[0].b;
-
- EBUG_ON(ck &&
- (path->btree_id != ck->key.btree_id ||
- !bkey_eq(path->pos, ck->key.pos)));
- if (!ck || !ck->valid)
+ if (!ck)
return bkey_s_c_null;
+ EBUG_ON(path->btree_id != ck->key.btree_id ||
+ !bkey_eq(path->pos, ck->key.pos));
+
*u = ck->k->k;
k = bkey_i_to_s_c(ck->k);
}
@@ -3089,7 +3087,8 @@ u32 bch2_trans_begin(struct btree_trans *trans)
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
- trans->locked = true;
+
+ trans_set_locked(trans);
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
@@ -3130,7 +3129,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
memset(trans, 0, sizeof(*trans));
- closure_init_stack(&trans->ref);
seqmutex_lock(&c->btree_trans_lock);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
@@ -3150,23 +3148,16 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
BUG_ON(pos_task &&
pid == pos_task->pid &&
pos->locked);
-
- if (pos_task && pid < pos_task->pid) {
- list_add_tail(&trans->list, &pos->list);
- goto list_add_done;
- }
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
-list_add_done:
+
+ list_add(&trans->list, &c->btree_trans_list);
seqmutex_unlock(&c->btree_trans_lock);
got_trans:
- trans->ref.closure_get_happened = false;
trans->c = c;
trans->last_begin_time = local_clock();
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
- trans->locked = true;
trans->journal_replay_not_finished =
unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
@@ -3180,6 +3171,9 @@ got_trans:
trans->paths_allocated[0] = 1;
+ static struct lock_class_key lockdep_key;
+ lockdep_init_map(&trans->dep_map, "bcachefs_btree", &lockdep_key, 0);
+
if (fn_idx < BCH_TRANSACTIONS_NR) {
trans->fn = bch2_btree_transaction_fns[fn_idx];
@@ -3200,6 +3194,9 @@ got_trans:
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
+ trans_set_locked(trans);
+
+ closure_init_stack_release(&trans->ref);
return trans;
}
@@ -3244,23 +3241,14 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- if (trans->fs_usage_deltas) {
- if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
- REPLICAS_DELTA_LIST_MAX)
- mempool_free(trans->fs_usage_deltas,
- &c->replicas_delta_pool);
- else
- kfree(trans->fs_usage_deltas);
- }
-
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
/*
- * trans->ref protects trans->locking_wait.task, btree_paths arary; used
+ * trans->ref protects trans->locking_wait.task, btree_paths array; used
* by cycle detector
*/
- closure_sync(&trans->ref);
+ closure_return_sync(&trans->ref);
trans->locking_wait.task = NULL;
unsigned long *paths_allocated = trans->paths_allocated;
@@ -3288,6 +3276,21 @@ void bch2_trans_put(struct btree_trans *trans)
}
}
+bool bch2_current_has_btree_trans(struct bch_fs *c)
+{
+ seqmutex_lock(&c->btree_trans_lock);
+ struct btree_trans *trans;
+ bool ret = false;
+ list_for_each_entry(trans, &c->btree_trans_list, list)
+ if (trans->locking_wait.task == current &&
+ trans->locked) {
+ ret = true;
+ break;
+ }
+ seqmutex_unlock(&c->btree_trans_lock);
+ return ret;
+}
+
static void __maybe_unused
bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
@@ -3385,8 +3388,6 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
if (trans) {
- closure_sync(&trans->ref);
-
seqmutex_lock(&c->btree_trans_lock);
list_del(&trans->list);
seqmutex_unlock(&c->btree_trans_lock);
@@ -3443,7 +3444,22 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
BTREE_TRANS_MEM_MAX) ?:
init_srcu_struct(&c->btree_trans_barrier);
- if (!ret)
- c->btree_trans_barrier_initialized = true;
- return ret;
+ if (ret)
+ return ret;
+
+ /*
+ * static annotation (hackily done) for lock ordering of reclaim vs.
+ * btree node locks:
+ */
+#ifdef CONFIG_LOCKDEP
+ fs_reclaim_acquire(GFP_KERNEL);
+ struct btree_trans *trans = bch2_trans_get(c);
+ trans_set_locked(trans);
+ bch2_trans_put(trans);
+ fs_reclaim_release(GFP_KERNEL);
+#endif
+
+ c->btree_trans_barrier_initialized = true;
+ return 0;
+
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 798eb1c47966..c7725865309c 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -268,12 +268,11 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
- struct bpos, bool);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
#else
static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos, bool key_cache) {}
+ struct bpos pos) {}
#endif
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
@@ -866,6 +865,14 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_p; \
})
+#define bch2_trans_run(_c, _do) \
+({ \
+ struct btree_trans *trans = bch2_trans_get(_c); \
+ int _ret = (_do); \
+ bch2_trans_put(trans); \
+ _ret; \
+})
+
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
@@ -875,6 +882,8 @@ void bch2_dump_trans_paths_updates(struct btree_trans *);
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
+bool bch2_current_has_btree_trans(struct bch_fs *);
+
extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
unsigned bch2_trans_get_fn_idx(const char *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 332dbf164929..74933490aaba 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -16,21 +16,6 @@
* operations for the regular btree iter code to use:
*/
-static int __journal_key_cmp(enum btree_id l_btree_id,
- unsigned l_level,
- struct bpos l_pos,
- const struct journal_key *r)
-{
- return (cmp_int(l_btree_id, r->btree_id) ?:
- cmp_int(l_level, r->level) ?:
- bpos_cmp(l_pos, r->k->k.p));
-}
-
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
- return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
@@ -548,7 +533,13 @@ static void __journal_keys_sort(struct journal_keys *keys)
struct journal_key *dst = keys->data;
darray_for_each(*keys, src) {
- if (src + 1 < &darray_top(*keys) &&
+ /*
+ * We don't accumulate accounting keys here because we have to
+ * compare each individual accounting key against the version in
+ * the btree during replay:
+ */
+ if (src->k->k.type != KEY_TYPE_accounting &&
+ src + 1 < &darray_top(*keys) &&
!journal_key_cmp(src, src + 1))
continue;
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 1ba4a79b0ef9..1653de9d609b 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+#include "bkey.h"
+
struct journal_iter {
struct list_head list;
enum btree_id btree_id;
@@ -26,6 +28,21 @@ struct btree_and_journal_iter {
bool prefetch;
};
+static inline int __journal_key_cmp(enum btree_id l_btree_id,
+ unsigned l_level,
+ struct bpos l_pos,
+ const struct journal_key *r)
+{
+ return (cmp_int(l_btree_id, r->btree_id) ?:
+ cmp_int(l_level, r->level) ?:
+ bpos_cmp(l_pos, r->k->k.p));
+}
+
+static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 2d3c0d45c37f..f2f2e525460b 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -39,6 +39,15 @@ static const struct rhashtable_params bch2_btree_key_cache_params = {
.automatic_shrinking = true,
};
+static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_cached *ck,
+ enum btree_node_locked_type lock_held)
+{
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
+ path->l[0].b = (void *) ck;
+ mark_btree_node_locked(trans, path, 0, lock_held);
+}
+
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
@@ -196,9 +205,22 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc,
six_unlock_intent(&ck->c.lock);
}
+static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
+{
+ struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp);
+ if (unlikely(!ck))
+ return NULL;
+ ck->k = kmalloc(key_u64s * sizeof(u64), gfp);
+ if (unlikely(!ck->k)) {
+ kmem_cache_free(bch2_key_cache, ck);
+ return NULL;
+ }
+ ck->u64s = key_u64s;
+ return ck;
+}
+
static struct bkey_cached *
-bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
- bool *was_new)
+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
@@ -259,9 +281,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
return ERR_PTR(ret);
}
- path->l[0].b = (void *) ck;
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
+ btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
@@ -274,8 +294,10 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
}
ck = allocate_dropping_locks(trans, ret,
- kmem_cache_zalloc(bch2_key_cache, _gfp));
+ __bkey_cached_alloc(key_u64s, _gfp));
if (ret) {
+ if (ck)
+ kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
return ERR_PTR(ret);
}
@@ -289,7 +311,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
- *was_new = true;
return ck;
}
@@ -319,71 +340,102 @@ out:
return ck;
}
-static struct bkey_cached *
-btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
+static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
- struct bkey_cached *ck;
- bool was_new = false;
- ck = bkey_cached_alloc(trans, path, &was_new);
- if (IS_ERR(ck))
- return ck;
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at
+ * most 7 bytes (it won't be used):
+ */
+ unsigned key_u64s = k.k->u64s + 1;
+
+ /*
+ * Allocate some extra space so that the transaction commit path is less
+ * likely to have to reallocate, since that requires a transaction
+ * restart:
+ */
+ key_u64s = min(256U, (key_u64s * 3) / 2);
+ key_u64s = roundup_pow_of_two(key_u64s);
+
+ struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
+ int ret = PTR_ERR_OR_ZERO(ck);
+ if (ret)
+ return ret;
if (unlikely(!ck)) {
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_id_str(path->btree_id));
- return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
+ return -BCH_ERR_ENOMEM_btree_key_cache_create;
}
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
}
ck->c.level = 0;
ck->c.btree_id = path->btree_id;
ck->key.btree_id = path->btree_id;
ck->key.pos = path->pos;
- ck->valid = false;
ck->flags = 1U << BKEY_CACHED_ACCESSED;
- if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
- &ck->hash,
- bch2_btree_key_cache_params))) {
- /* We raced with another fill: */
-
- if (likely(was_new)) {
- six_unlock_write(&ck->c.lock);
- six_unlock_intent(&ck->c.lock);
- kfree(ck);
- } else {
- bkey_cached_free_fast(bc, ck);
+ if (unlikely(key_u64s > ck->u64s)) {
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
+ kmalloc(key_u64s * sizeof(u64), _gfp));
+ if (unlikely(!new_k)) {
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(ck->key.btree_id), key_u64s);
+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ } else if (ret) {
+ kfree(new_k);
+ goto err;
}
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
- return NULL;
+ kfree(ck->k);
+ ck->k = new_k;
+ ck->u64s = key_u64s;
}
- atomic_long_inc(&bc->nr_keys);
+ bkey_reassemble(ck->k, k);
+
+ ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);
+ if (unlikely(ret)) /* raced with another fill? */
+ goto err;
+ atomic_long_inc(&bc->nr_keys);
six_unlock_write(&ck->c.lock);
- return ck;
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
+ if (lock_want == SIX_LOCK_read)
+ six_lock_downgrade(&ck->c.lock);
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+err:
+ bkey_cached_free_fast(bc, ck);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
+
+ return ret;
}
-static int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
- struct bkey_cached *ck)
+static noinline int btree_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ unsigned flags)
{
+ if (flags & BTREE_ITER_cached_nofill) {
+ ck_path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
+ }
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- unsigned new_u64s = 0;
- struct bkey_i *new_k = NULL;
int ret;
- bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+ bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
@@ -392,70 +444,15 @@ static int btree_key_cache_fill(struct btree_trans *trans,
if (ret)
goto err;
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
-
- /*
- * bch2_varint_decode can read past the end of the buffer by at
- * most 7 bytes (it won't be used):
- */
- new_u64s = k.k->u64s + 1;
-
- /*
- * Allocate some extra space so that the transaction commit path is less
- * likely to have to reallocate, since that requires a transaction
- * restart:
- */
- new_u64s = min(256U, (new_u64s * 3) / 2);
-
- if (new_u64s > ck->u64s) {
- new_u64s = roundup_pow_of_two(new_u64s);
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
- if (!new_k) {
- bch2_trans_unlock(trans);
-
- new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
- if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
- bch2_btree_id_str(ck->key.btree_id), new_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
- goto err;
- }
-
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_k);
- goto err;
- }
-
- if (!bch2_btree_node_relock(trans, ck_path, 0)) {
- kfree(new_k);
- trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
- goto err;
- }
- }
- }
+ /* Recheck after btree lookup, before allocating: */
+ ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
+ if (unlikely(ret))
+ goto out;
- ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
- if (ret) {
- kfree(new_k);
+ ret = btree_key_cache_create(trans, ck_path, k);
+ if (ret)
goto err;
- }
-
- if (new_k) {
- kfree(ck->k);
- ck->u64s = new_u64s;
- ck->k = new_k;
- }
-
- bkey_reassemble(ck->k, k);
- ck->valid = true;
- bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
-
+out:
/* We're not likely to need this iterator again: */
bch2_set_btree_iter_dontneed(&iter);
err:
@@ -463,137 +460,62 @@ err:
return ret;
}
-static noinline int
-bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
- unsigned flags)
+static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
+ struct btree_path *path)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
- int ret = 0;
-
- BUG_ON(path->level);
-
- path->l[1].b = NULL;
-
- if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
- }
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck) {
- ck = btree_key_cache_create(trans, path);
- ret = PTR_ERR_OR_ZERO(ck);
- if (ret)
- goto err;
- if (!ck)
- goto retry;
-
- mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
- path->locks_want = 1;
- } else {
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto err;
-
- BUG_ON(ret);
-
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
- }
-
- mark_btree_node_locked(trans, path, 0,
- (enum btree_node_locked_type) lock_want);
- }
+ if (!ck)
+ return -ENOENT;
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
-fill:
- path->uptodate = BTREE_ITER_UPTODATE;
+ enum six_lock_type lock_want = __btree_lock_want(path, 0);
- if (!ck->valid && !(flags & BTREE_ITER_cached_nofill)) {
- ret = bch2_btree_path_upgrade(trans, path, 1) ?:
- btree_key_cache_fill(trans, path, ck) ?:
- bch2_btree_path_relock(trans, path, _THIS_IP_);
- if (ret)
- goto err;
+ int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_);
+ if (ret)
+ return ret;
- path->uptodate = BTREE_ITER_UPTODATE;
+ if (ck->key.btree_id != path->btree_id ||
+ !bpos_eq(ck->key.pos, path->pos)) {
+ six_unlock_type(&ck->c.lock, lock_want);
+ goto retry;
}
if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
- BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
- BUG_ON(path->uptodate);
-
- return ret;
-err:
- path->uptodate = BTREE_ITER_NEED_TRAVERSE;
- if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- btree_node_unlock(trans, path, 0);
- path->l[0].b = ERR_PTR(ret);
- }
- return ret;
+ btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
}
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
unsigned flags)
{
- struct bch_fs *c = trans->c;
- struct bkey_cached *ck;
- int ret = 0;
-
EBUG_ON(path->level);
path->l[1].b = NULL;
if (bch2_btree_node_relock_notrace(trans, path, 0)) {
- ck = (void *) path->l[0].b;
- goto fill;
+ path->uptodate = BTREE_ITER_UPTODATE;
+ return 0;
}
-retry:
- ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
- if (!ck) {
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
- } else {
- enum six_lock_type lock_want = __btree_lock_want(path, 0);
-
- ret = btree_node_lock(trans, path, (void *) ck, 0,
- lock_want, _THIS_IP_);
- EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
-
- if (ret)
- return ret;
- if (ck->key.btree_id != path->btree_id ||
- !bpos_eq(ck->key.pos, path->pos)) {
- six_unlock_type(&ck->c.lock, lock_want);
- goto retry;
+ int ret;
+ do {
+ ret = btree_path_traverse_cached_fast(trans, path);
+ if (unlikely(ret == -ENOENT))
+ ret = btree_key_cache_fill(trans, path, flags);
+ } while (ret == -EEXIST);
+
+ if (unlikely(ret)) {
+ path->uptodate = BTREE_ITER_NEED_TRAVERSE;
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ btree_node_unlock(trans, path, 0);
+ path->l[0].b = ERR_PTR(ret);
}
-
- mark_btree_node_locked(trans, path, 0,
- (enum btree_node_locked_type) lock_want);
}
-
- path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
- path->l[0].b = (void *) ck;
-fill:
- if (!ck->valid)
- return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
-
- if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
- set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
-
- path->uptodate = BTREE_ITER_UPTODATE;
- EBUG_ON(!ck->valid);
- EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
-
return ret;
}
@@ -632,8 +554,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
}
- BUG_ON(!ck->valid);
-
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
@@ -755,7 +675,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
bkey_copy(ck->k, insert);
- ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
@@ -794,10 +713,9 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
+ struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = (void *) path->l[0].b;
- BUG_ON(!ck->valid);
-
/*
* We just did an update to the btree, bypassing the key cache: the key
* cache key is now stale and must be dropped, even if dirty:
@@ -808,7 +726,11 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
bch2_journal_pin_drop(&c->journal, &ck->journal);
}
- ck->valid = false;
+ bkey_cached_evict(bc, ck);
+ bkey_cached_free_fast(bc, ck);
+
+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index d66fff22109a..efe2a007b482 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -10,19 +10,9 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
enum six_lock_init_flags flags)
{
__six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
- lockdep_set_novalidate_class(&b->lock);
+ lockdep_set_notrack_class(&b->lock);
}
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void)
-{
-#if 0
- //Re-enable when lock_class_is_held() is merged:
- BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
-#endif
-}
-#endif
-
/* Btree node locking: */
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
@@ -231,7 +221,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_newline(&buf);
}
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
printbuf_exit(&buf);
BUG();
}
@@ -792,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
return bch2_trans_relock_fail(trans, path, &f, trace);
}
- trans->locked = true;
+ trans_set_locked(trans);
out:
bch2_trans_verify_locks(trans);
return 0;
@@ -812,16 +802,14 @@ void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
__bch2_trans_unlock(trans);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
__bch2_trans_unlock(trans);
- trans->locked = false;
- trans->last_unlock_ip = _RET_IP_;
+ trans_set_unlocked(trans);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 7f41545b9147..11a64ead8685 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -15,12 +15,6 @@
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
-#ifdef CONFIG_LOCKDEP
-void bch2_assert_btree_nodes_not_locked(void);
-#else
-static inline void bch2_assert_btree_nodes_not_locked(void) {}
-#endif
-
void bch2_trans_unlock_noassert(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
@@ -136,6 +130,7 @@ static inline void btree_node_unlock(struct btree_trans *trans,
int lock_type = btree_node_locked_type(path, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
+ EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED);
if (lock_type != BTREE_NODE_UNLOCKED) {
six_unlock_type(&path->l[level].b->c.lock, lock_type);
@@ -193,6 +188,30 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
/* lock: */
+static inline void trans_set_locked(struct btree_trans *trans)
+{
+ if (!trans->locked) {
+ lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_);
+ trans->locked = true;
+ trans->last_unlock_ip = 0;
+
+ trans->pf_memalloc_nofs = (current->flags & PF_MEMALLOC_NOFS) != 0;
+ current->flags |= PF_MEMALLOC_NOFS;
+ }
+}
+
+static inline void trans_set_unlocked(struct btree_trans *trans)
+{
+ if (trans->locked) {
+ lock_release(&trans->dep_map, _THIS_IP_);
+ trans->locked = false;
+ trans->last_unlock_ip = _RET_IP_;
+
+ if (!trans->pf_memalloc_nofs)
+ current->flags &= ~PF_MEMALLOC_NOFS;
+ }
+}
+
static inline int __btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type,
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 2cb0442f6cc9..001107226377 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -22,7 +22,9 @@ struct find_btree_nodes_worker {
static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
- prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+ prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
+ bch2_btree_id_str(n->btree_id), n->level, n->seq,
+ n->journal_seq, n->cookie);
bch2_bpos_to_text(out, n->min_key);
prt_str(out, "-");
bch2_bpos_to_text(out, n->max_key);
@@ -63,19 +65,37 @@ static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n
memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}
+static inline u64 bkey_journal_seq(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode_v3:
+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
+ default:
+ return 0;
+ }
+}
+
static bool found_btree_node_is_readable(struct btree_trans *trans,
struct found_btree_node *f)
{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+ struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
- found_btree_node_to_key(&k.k, f);
+ found_btree_node_to_key(&tmp.k, f);
- struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+ struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
if (!ret)
return ret;
f->sectors_written = b->written;
+ f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
+
+ struct bkey_s_c k;
+ struct bkey unpacked;
+ struct btree_node_iter iter;
+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
+ f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
+
six_unlock_read(&b->c.lock);
/*
@@ -84,7 +104,7 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
* this node
*/
if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &k.k);
+ bch2_btree_node_evict(trans, &tmp.k);
return ret;
}
@@ -105,7 +125,8 @@ static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
static int found_btree_node_cmp_time(const struct found_btree_node *l,
const struct found_btree_node *r)
{
- return cmp_int(l->seq, r->seq);
+ return cmp_int(l->seq, r->seq) ?:
+ cmp_int(l->journal_seq, r->journal_seq);
}
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
@@ -309,15 +330,15 @@ again:
} else if (n->level) {
n->overwritten = true;
} else {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
- found_btree_node_to_text(&buf, c, start);
- prt_str(&buf, "\n ");
- found_btree_node_to_text(&buf, c, n);
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -BCH_ERR_fsck_repair_unimplemented;
+ if (bpos_cmp(start->max_key, n->max_key) >= 0)
+ n->overwritten = true;
+ else {
+ n->range_updated = true;
+ n->min_key = bpos_successor(start->max_key);
+ n->range_updated = true;
+ bubble_up(n, end);
+ goto again;
+ }
}
}
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
index 5cfaeb5ac831..b6c36c45d0be 100644
--- a/fs/bcachefs/btree_node_scan_types.h
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -11,6 +11,7 @@ struct found_btree_node {
u8 level;
unsigned sectors_written;
u32 seq;
+ u64 journal_seq;
u64 cookie;
struct bpos min_key;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 74e1ff225674..cca336fe46e9 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -10,6 +10,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
+#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -136,7 +137,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
trans_for_each_update(trans, i)
- if (!same_leaf_as_prev(trans, i))
+ if (btree_node_locked_type(trans->paths + i->path, i->level) ==
+ BTREE_NODE_WRITE_LOCKED)
bch2_btree_node_unlock_write_inlined(trans,
trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
@@ -228,14 +230,14 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
struct btree_trans *trans = bch2_trans_get(c);
- unsigned long old, new, v;
+ unsigned long old, new;
unsigned idx = w - b->writes;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
if (!(old & (1 << BTREE_NODE_dirty)) ||
!!(old & (1 << BTREE_NODE_write_idx)) != idx ||
@@ -245,7 +247,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_journal_reclaim;
new |= 1 << BTREE_NODE_need_write;
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
@@ -456,34 +458,36 @@ static int run_one_mem_trigger(struct btree_trans *trans,
struct btree_insert_entry *i,
unsigned flags)
{
- struct bkey_s_c old = { &i->old_k, i->old_v };
- struct bkey_i *new = i->k;
- const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
- const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
- int ret;
-
verify_update_old_key(trans, i);
if (unlikely(flags & BTREE_TRIGGER_norun))
return 0;
- if (old_ops->trigger == new_ops->trigger) {
- ret = bch2_key_trigger(trans, i->btree_id, i->level,
+ struct bkey_s_c old = { &i->old_k, i->old_v };
+ struct bkey_i *new = i->k;
+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+ if (old_ops->trigger == new_ops->trigger)
+ return bch2_key_trigger(trans, i->btree_id, i->level,
old, bkey_i_to_s(new),
BTREE_TRIGGER_insert|BTREE_TRIGGER_overwrite|flags);
- } else {
- ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
+ else
+ return bch2_key_trigger_new(trans, i->btree_id, i->level,
bkey_i_to_s(new), flags) ?:
- bch2_key_trigger_old(trans, i->btree_id, i->level,
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
old, flags);
- }
-
- return ret;
}
static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
bool overwrite)
{
+ verify_update_old_key(trans, i);
+
+ if ((i->flags & BTREE_TRIGGER_norun) ||
+ !btree_node_type_has_trans_triggers(i->bkey_type))
+ return 0;
+
/*
* Transactional triggers create new btree_insert_entries, so we can't
* pass them a pointer to a btree_insert_entry, that memory is going to
@@ -495,12 +499,6 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
unsigned flags = i->flags|BTREE_TRIGGER_transactional;
- verify_update_old_key(trans, i);
-
- if ((i->flags & BTREE_TRIGGER_norun) ||
- !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
- return 0;
-
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
old_ops->trigger == new_ops->trigger) {
@@ -523,10 +521,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
unsigned btree_id_start)
{
- bool trans_trigger_run;
- int ret, overwrite;
-
- for (overwrite = 1; overwrite >= 0; --overwrite) {
+ for (int overwrite = 1; overwrite >= 0; --overwrite) {
+ bool trans_trigger_run;
/*
* Running triggers will append more updates to the list of updates as
@@ -541,7 +537,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
if (trans->updates[i].btree_id != btree_id)
continue;
- ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
+ int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
if (ret < 0)
return ret;
if (ret)
@@ -594,7 +590,7 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+ btree_node_type_has_trans_triggers(i->bkey_type) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
return 0;
@@ -602,24 +598,25 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- trans_for_each_update(trans, i) {
- /*
- * XXX: synchronization of cached update triggers with gc
- * XXX: synchronization of interior node updates with gc
- */
- BUG_ON(i->cached || i->level);
-
- if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
- gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ trans_for_each_update(trans, i)
+ if (btree_node_type_has_triggers(i->bkey_type) &&
+ gc_visited(trans->c, gc_pos_btree(i->btree_id, i->level, i->k->k.p))) {
int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_gc);
if (ret)
return ret;
}
- }
return 0;
}
+static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+ return (struct bversion) {
+ .hi = res->seq >> 32,
+ .lo = (res->seq << 32) | (res->offset + offset),
+ };
+}
+
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct btree_insert_entry **stopped_at,
@@ -628,7 +625,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
- int ret;
+ int ret = 0;
bch2_trans_verify_not_unlocked(trans);
bch2_trans_verify_not_in_restart(trans);
@@ -693,23 +690,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
i->k->k.version = MAX_VERSION;
}
- if (trans->fs_usage_deltas &&
- bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
- return -BCH_ERR_btree_insert_need_mark_replicas;
-
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
-
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
if (ret)
- goto revert_fs_usage;
+ return ret;
h = h->next;
}
+ struct jset_entry *entry = trans->journal_entries;
+
+ if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ percpu_down_read(&c->mark_lock);
+
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
+ struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
+
+ a->k.version = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) entry - (u64 *) trans->journal_entries);
+ BUG_ON(bversion_zero(a->k.version));
+ ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false);
+ if (ret)
+ goto revert_fs_usage;
+ }
+ percpu_up_read(&c->mark_lock);
+
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+ }
+
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags);
if (ret)
goto fatal_err;
@@ -764,29 +778,44 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
- if (!i->cached) {
+ if (!i->cached)
bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
- } else if (!i->key_cache_already_flushed)
+ else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
- else {
+ else
bch2_btree_key_cache_drop(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- }
}
return 0;
fatal_err:
- bch2_fatal_error(c);
+ bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
+ percpu_down_read(&c->mark_lock);
revert_fs_usage:
- if (trans->fs_usage_deltas)
- bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+ for (struct jset_entry *entry2 = trans->journal_entries;
+ entry2 != entry;
+ entry2 = vstruct_next(entry2))
+ if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) {
+ struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
+
+ bch2_accounting_neg(a);
+ bch2_accounting_mem_mod_locked(trans, a.c, false);
+ bch2_accounting_neg(a);
+ }
+ percpu_up_read(&c->mark_lock);
return ret;
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
+ /*
+ * Accounting keys aren't deduped in the journal: we have to compare
+ * each individual update against what's in the btree to see if it has
+ * been applied yet, and accounting updates also don't overwrite,
+ * they're deltas that accumulate.
+ */
trans_for_each_update(trans, i)
- bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+ if (i->k->k.type != KEY_TYPE_accounting)
+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
@@ -922,7 +951,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
- bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+ bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_journal_res_get_blocked:
/*
@@ -993,15 +1022,24 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- int ret = 0;
trans_for_each_update(trans, i) {
- ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
if (ret)
- break;
+ return ret;
}
- return ret;
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (i->type == BCH_JSET_ENTRY_btree_keys ||
+ i->type == BCH_JSET_ENTRY_write_buffer_keys) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
@@ -1017,8 +1055,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
!trans->journal_entries_u64s)
goto out_reset;
- memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
-
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out_reset;
@@ -1115,6 +1151,7 @@ retry:
bch2_trans_verify_not_in_restart(trans);
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 87f485e9c552..b256b2a20a4f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -388,7 +388,6 @@ struct bkey_cached {
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
- bool valid;
struct bkey_cached_key key;
struct rhash_head hash;
@@ -478,12 +477,13 @@ struct btree_trans {
btree_path_idx_t nr_sorted;
btree_path_idx_t nr_paths;
btree_path_idx_t nr_paths_max;
+ btree_path_idx_t nr_updates;
u8 fn_idx;
- u8 nr_updates;
u8 lock_must_abort;
bool lock_may_not_fail:1;
bool srcu_held:1;
bool locked:1;
+ bool pf_memalloc_nofs:1;
bool write_locked:1;
bool used_mempool:1;
bool in_traverse_all:1;
@@ -522,8 +522,10 @@ struct btree_trans {
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
- struct replicas_delta_list *fs_usage_deltas;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
/* Entries before this are zeroed out on every bch2_trans_get() call */
struct list_head list;
@@ -754,9 +756,19 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
-static inline bool btree_node_type_needs_gc(enum btree_node_type type)
+static inline bool btree_node_type_has_trans_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS;
+}
+
+static inline bool btree_node_type_has_atomic_triggers(enum btree_node_type type)
+{
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS;
+}
+
+static inline bool btree_node_type_has_triggers(enum btree_node_type type)
{
- return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}
static inline bool btree_node_type_is_extents(enum btree_node_type type)
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index f3c645a43dcb..d6f6df10dcc3 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -656,14 +656,16 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
* @disk_res: must be non-NULL whenever inserting or potentially
* splitting data extents
* @flags: transaction commit flags
+ * @iter_flags: btree iter update trigger flags
*
* Returns: 0 on success, error code on failure
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
- struct disk_reservation *disk_res, int flags)
+ struct disk_reservation *disk_res, int flags,
+ enum btree_iter_update_trigger_flags iter_flags)
{
return bch2_trans_do(c, disk_res, NULL, flags,
- bch2_btree_insert_trans(trans, id, k, 0));
+ bch2_btree_insert_trans(trans, id, k, iter_flags));
}
int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b4894e4d5447..60393e98084d 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -29,6 +29,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
"pin journal entry referred to by trans->journal_res.seq") \
x(journal_reclaim, "operation required for journal reclaim; may return error" \
"instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+ x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied")
enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
@@ -56,8 +57,9 @@ int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
enum btree_iter_update_trigger_flags);
-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
- struct disk_reservation *, int flags);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct
+ disk_reservation *, int flags, enum
+ btree_iter_update_trigger_flags iter_flags);
int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
@@ -130,7 +132,19 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
enum btree_id btree,
struct bkey_i *k)
{
- if (unlikely(trans->journal_replay_not_finished))
+ /*
+ * Most updates skip the btree write buffer until journal replay is
+ * finished because synchronization with journal replay relies on having
+ * a btree node locked - if we're overwriting a key in the journal that
+ * journal replay hasn't yet replayed, we have to mark it as
+ * overwritten.
+ *
+ * But accounting updates don't overwrite, they're deltas, and they have
+ * to be flushed to the btree strictly in order for journal replay to be
+ * able to tell which updates need to be applied:
+ */
+ if (k->k.type != KEY_TYPE_accounting &&
+ unlikely(trans->journal_replay_not_finished))
return bch2_btree_insert_clone_trans(trans, btree, k);
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
@@ -178,14 +192,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
@@ -203,14 +209,6 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
trans->journal_entries_u64s = 0;
trans->hooks = NULL;
trans->extra_disk_res = 0;
-
- if (trans->fs_usage_deltas) {
- trans->fs_usage_deltas->used = 0;
- memset((void *) trans->fs_usage_deltas +
- offsetof(struct replicas_delta_list, memset_start), 0,
- (void *) &trans->fs_usage_deltas->memset_end -
- (void *) &trans->fs_usage_deltas->memset_start);
- }
}
static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 60b8544cea48..31ee50184be2 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -61,7 +61,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!bpos_eq(b->data->min_key, POS_MIN)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
- need_fsck_err(c, btree_root_bad_min_key,
+ need_fsck_err(trans, btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf);
goto topology_repair;
}
@@ -69,7 +69,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
- need_fsck_err(c, btree_root_bad_max_key,
+ need_fsck_err(trans, btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf);
goto topology_repair;
}
@@ -105,7 +105,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
prt_str(&buf, "\n next ");
bch2_bkey_val_to_text(&buf, c, k);
- need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+ need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
goto topology_repair;
}
@@ -122,7 +122,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+ need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
goto topology_repair;
} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
bch2_topology_error(c);
@@ -135,7 +135,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
prt_str(&buf, "\n last key ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+ need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
goto topology_repair;
}
out:
@@ -1356,10 +1356,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
struct bch_fs *c = as->c;
struct bkey_packed *k;
struct printbuf buf = PRINTBUF;
- unsigned long old, new, v;
+ unsigned long old, new;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
- !btree_ptr_sectors_written(insert));
+ !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));
if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
@@ -1395,14 +1395,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);
- v = READ_ONCE(b->flags);
+ old = READ_ONCE(b->flags);
do {
- old = new = v;
+ new = old;
new &= ~BTREE_WRITE_TYPE_MASK;
new |= BTREE_WRITE_interior;
new |= 1 << BTREE_NODE_need_write;
- } while ((v = cmpxchg(&b->flags, old, new)) != old);
+ } while (!try_cmpxchg(&b->flags, &old, new));
printbuf_exit(&buf);
}
@@ -2647,6 +2647,28 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
return end;
}
+static void bch2_btree_alloc_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct btree_alloc *a)
+{
+ printbuf_indent_add(out, 2);
+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
+ prt_newline(out);
+
+ struct open_bucket *ob;
+ unsigned i;
+ open_bucket_for_each(c, &a->ob, ob, i)
+ bch2_open_bucket_to_text(out, c, ob);
+
+ printbuf_indent_sub(out, 2);
+}
+
+void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
+ bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
+}
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
if (c->btree_node_rewrite_worker)
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index b5b76ce01cfc..02c6ecada97c 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -335,6 +335,8 @@ struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);
+void bch2_btree_reserve_cache_to_text(struct printbuf *, struct bch_fs *);
+
void bch2_fs_btree_interior_update_exit(struct bch_fs *);
void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 75c8a196b3f6..3f56b584f8ec 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -1,11 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bkey_buf.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "disk_accounting.h"
#include "error.h"
+#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
@@ -132,7 +135,9 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
- bool *write_locked, size_t *fast)
+ bool *write_locked,
+ bool *accounting_accumulated,
+ size_t *fast)
{
struct btree_path *path;
int ret;
@@ -145,6 +150,16 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
if (ret)
return ret;
+ if (!*accounting_accumulated && wb->k.k.type == KEY_TYPE_accounting) {
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
+
+ if (k.k->type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
+ bkey_s_c_to_accounting(k));
+ }
+ *accounting_accumulated = true;
+
/*
* We can't clone a path that has write locks: unshare it now, before
* set_pos and traverse():
@@ -257,8 +272,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_iter iter = { NULL };
- size_t skipped = 0, fast = 0, slowpath = 0;
+ size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
+ bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;
bch2_trans_unlock(trans);
@@ -299,11 +315,22 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
BUG_ON(!k->journal_seq);
+ if (!accounting_replay_done &&
+ k->k.k.type == KEY_TYPE_accounting) {
+ slowpath++;
+ continue;
+ }
+
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- skipped++;
+ if (k->k.k.type == KEY_TYPE_accounting &&
+ n->k.k.type == KEY_TYPE_accounting)
+ bch2_accounting_accumulate(bkey_i_to_accounting(&n->k),
+ bkey_i_to_s_c_accounting(&k->k));
+
+ overwritten++;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
@@ -338,13 +365,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bch2_btree_iter_set_pos(&iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;
+ bool accounting_accumulated = false;
do {
if (race_fault()) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
- ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
+ ret = wb_flush_one(trans, &iter, k, &write_locked,
+ &accounting_accumulated, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
@@ -385,8 +414,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
if (!i->journal_seq)
continue;
- bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
- bch2_btree_write_buffer_journal_flush);
+ if (!accounting_replay_done &&
+ i->k.k.type == KEY_TYPE_accounting) {
+ could_not_insert++;
+ continue;
+ }
+
+ if (!could_not_insert)
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
@@ -399,13 +435,45 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
btree_write_buffered_insert(trans, i));
if (ret)
goto err;
+
+ i->journal_seq = 0;
+ }
+
+ /*
+ * If journal replay hasn't finished with accounting keys we
+ * can't flush accounting keys at all - condense them and leave
+ * them for next time.
+ *
+ * Q: Can the write buffer overflow?
+ * A Shouldn't be any actual risk. It's just new accounting
+ * updates that the write buffer can't flush, and those are only
+ * going to be generated by interior btree node updates as
+ * journal replay has to split/rewrite nodes to make room for
+ * its updates.
+ *
+ * And for those new acounting updates, updates to the same
+ * counters get accumulated as they're flushed from the journal
+ * to the write buffer - see the patch for eytzingcer tree
+ * accumulated. So we could only overflow if the number of
+ * distinct counters touched somehow was very large.
+ */
+ if (could_not_insert) {
+ struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+
+ darray_for_each(wb->flushing.keys, i)
+ if (i->journal_seq)
+ *dst++ = *i;
+ wb->flushing.keys.nr = dst - wb->flushing.keys.data;
}
}
err:
+ if (ret || !could_not_insert) {
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
+ }
+
bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
- trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
- bch2_journal_pin_drop(j, &wb->flushing.pin);
- wb->flushing.keys.nr = 0;
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, overwritten, fast, 0);
return ret;
}
@@ -492,6 +560,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
return ret;
}
+/*
+ * In check and repair code, when checking references to write buffer btrees we
+ * need to issue a flush before we have a definitive error: this issues a flush
+ * if this is a key we haven't yet checked.
+ */
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf tmp;
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
+ bch2_bkey_buf_reassemble(&tmp, c, referring_k);
+
+ if (bkey_is_btree_ptr(referring_k.k)) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
+
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+ }
+err:
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
@@ -507,6 +610,29 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}
+static void wb_accounting_sort(struct btree_write_buffer *wb)
+{
+ eytzinger0_sort(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, NULL);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *c, enum btree_id btree,
+ struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key new = { .btree = btree };
+
+ bkey_copy(&new.k, &k->k_i);
+
+ int ret = darray_push(&wb->accounting, new);
+ if (ret)
+ return ret;
+
+ wb_accounting_sort(wb);
+ return 0;
+}
+
int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
@@ -576,11 +702,35 @@ void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_ke
bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+
+ darray_for_each(wb->accounting, i)
+ memset(&i->k.v, 0, bkey_val_bytes(&i->k.k));
}
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ unsigned live_accounting_keys = 0;
+ int ret = 0;
+
+ darray_for_each(wb->accounting, i)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&i->k))) {
+ i->journal_seq = dst->seq;
+ live_accounting_keys++;
+ ret = __bch2_journal_key_to_wb(c, dst, i->btree, &i->k);
+ if (ret)
+ break;
+ }
+
+ if (live_accounting_keys * 2 < wb->accounting.nr) {
+ struct btree_write_buffered_key *dst = wb->accounting.data;
+
+ darray_for_each(wb->accounting, src)
+ if (!bch2_accounting_key_is_zero(bkey_i_to_s_c_accounting(&src->k)))
+ *dst++ = *src;
+ wb->accounting.nr = dst - wb->accounting.data;
+ wb_accounting_sort(wb);
+ }
if (!dst->wb->keys.nr)
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
@@ -593,6 +743,8 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
mutex_unlock(&wb->inc.lock);
+
+ return ret;
}
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
@@ -616,7 +768,7 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
buf->need_flush_to_write_buffer = false;
spin_unlock(&c->journal.lock);
out:
- bch2_journal_keys_to_write_buffer_end(c, &dst);
+ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
return ret;
}
@@ -648,6 +800,7 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
!bch2_journal_error(&c->journal));
+ darray_exit(&wb->accounting);
darray_exit(&wb->sorted);
darray_exit(&wb->flushing.keys);
darray_exit(&wb->inc.keys);
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index eebcd2b15249..725e79654216 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
#include "bkey.h"
+#include "disk_accounting.h"
static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
{
@@ -23,22 +24,54 @@ int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+struct bkey_buf;
+int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *);
+
struct journal_keys_to_wb {
struct btree_write_buffer_keys *wb;
size_t room;
u64 seq;
};
+static inline int wb_key_cmp(const void *_l, const void *_r)
+{
+ const struct btree_write_buffered_key *l = _l;
+ const struct btree_write_buffered_key *r = _r;
+
+ return cmp_int(l->btree, r->btree) ?: bpos_cmp(l->k.k.p, r->k.k.p);
+}
+
+int bch2_accounting_key_to_wb_slowpath(struct bch_fs *,
+ enum btree_id, struct bkey_i_accounting *);
+
+static inline int bch2_accounting_key_to_wb(struct bch_fs *c,
+ enum btree_id btree, struct bkey_i_accounting *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ struct btree_write_buffered_key search;
+ search.btree = btree;
+ search.k.k.p = k->k.p;
+
+ unsigned idx = eytzinger0_find(wb->accounting.data, wb->accounting.nr,
+ sizeof(wb->accounting.data[0]),
+ wb_key_cmp, &search);
+
+ if (idx >= wb->accounting.nr)
+ return bch2_accounting_key_to_wb_slowpath(c, btree, k);
+
+ struct bkey_i_accounting *dst = bkey_i_to_accounting(&wb->accounting.data[idx].k);
+ bch2_accounting_accumulate(dst, accounting_i_to_s_c(k));
+ return 0;
+}
+
int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);
-static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
- EBUG_ON(!dst->seq);
-
if (unlikely(!dst->room))
return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
@@ -51,8 +84,19 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
return 0;
}
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ return k->k.type == KEY_TYPE_accounting
+ ? bch2_accounting_key_to_wb(c, btree, bkey_i_to_accounting(k))
+ : __bch2_journal_key_to_wb(c, dst, btree, k);
+}
+
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
-void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index 9b9433de9c36..e9e76e20f43b 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -52,6 +52,8 @@ struct btree_write_buffer {
struct btree_write_buffer_keys inc;
struct btree_write_buffer_keys flushing;
struct work_struct flush_work;
+
+ DARRAY(struct btree_write_buffered_key) accounting;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 743d57eba760..2650a0d24663 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -13,6 +13,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
@@ -25,197 +26,10 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
- enum bch_data_type data_type,
- s64 sectors)
-{
- switch (data_type) {
- case BCH_DATA_btree:
- fs_usage->btree += sectors;
- break;
- case BCH_DATA_user:
- case BCH_DATA_parity:
- fs_usage->data += sectors;
- break;
- case BCH_DATA_cached:
- fs_usage->cached += sectors;
- break;
- default:
- break;
- }
-}
-
-void bch2_fs_usage_initialize(struct bch_fs *c)
-{
- percpu_down_write(&c->mark_lock);
- struct bch_fs_usage *usage = c->usage_base;
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
-
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->b.reserved += usage->persistent_reserved[i];
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
- }
-
- for_each_member_device(c, ca) {
- struct bch_dev_usage dev = bch2_dev_usage_read(ca);
-
- usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
- ca->mi.bucket_size;
- }
-
- percpu_up_write(&c->mark_lock);
-}
-
-static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
- unsigned journal_seq,
- bool gc)
-{
- BUG_ON(!gc && !journal_seq);
-
- return this_cpu_ptr(gc
- ? ca->usage_gc
- : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
- struct bch_fs *c = ca->fs;
- unsigned seq, i, u64s = dev_usage_u64s();
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- memcpy(usage, ca->usage_base, u64s * sizeof(u64));
- for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
- } while (read_seqcount_retry(&c->usage_lock, seq));
-}
-
-u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
-{
- ssize_t offset = v - (u64 *) c->usage_base;
- unsigned i, seq;
- u64 ret;
-
- BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
- percpu_rwsem_assert_held(&c->mark_lock);
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- ret = *v;
-
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
- } while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
-}
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
-{
- struct bch_fs_usage_online *ret;
- unsigned nr_replicas = READ_ONCE(c->replicas.nr);
- unsigned seq, i;
-retry:
- ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
- if (unlikely(!ret))
- return NULL;
-
- percpu_down_read(&c->mark_lock);
-
- if (nr_replicas != c->replicas.nr) {
- nr_replicas = c->replicas.nr;
- percpu_up_read(&c->mark_lock);
- kfree(ret);
- goto retry;
- }
-
- ret->online_reserved = percpu_u64_get(c->online_reserved);
-
- do {
- seq = read_seqcount_begin(&c->usage_lock);
- unsafe_memcpy(&ret->u, c->usage_base,
- __fs_usage_u64s(nr_replicas) * sizeof(u64),
- "embedded variable length struct");
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
- __fs_usage_u64s(nr_replicas));
- } while (read_seqcount_retry(&c->usage_lock, seq));
-
- return ret;
-}
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
-{
- unsigned u64s = fs_usage_u64s(c);
-
- BUG_ON(idx >= ARRAY_SIZE(c->usage));
-
- preempt_disable();
- write_seqcount_begin(&c->usage_lock);
-
- acc_u64s_percpu((u64 *) c->usage_base,
- (u64 __percpu *) c->usage[idx], u64s);
- percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL) {
- u64s = dev_usage_u64s();
-
- acc_u64s_percpu((u64 *) ca->usage_base,
- (u64 __percpu *) ca->usage[idx], u64s);
- percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
- }
- rcu_read_unlock();
-
- write_seqcount_end(&c->usage_lock);
- preempt_enable();
-}
-
-void bch2_fs_usage_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_fs_usage_online *fs_usage)
-{
- unsigned i;
-
- prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
-
- prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.b.hidden);
- prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.b.data);
- prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.b.cached);
- prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.b.reserved);
- prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.b.nr_inodes);
- prt_printf(out, "online reserved:\t\t%llu\n",
- fs_usage->online_reserved);
-
- for (i = 0;
- i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
- i++) {
- prt_printf(out, "%u replicas:\n", i + 1);
- prt_printf(out, "\treserved:\t\t%llu\n",
- fs_usage->u.persistent_reserved[i]);
- }
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
-
- prt_printf(out, "\t");
- bch2_replicas_entry_to_text(out, e);
- prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
- }
+ memset(usage, 0, sizeof(*usage));
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
}
static u64 reserve_factor(u64 r)
@@ -223,16 +37,6 @@ static u64 reserve_factor(u64 r)
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
-{
- return min(fs_usage->u.b.hidden +
- fs_usage->u.b.btree +
- fs_usage->u.b.data +
- reserve_factor(fs_usage->u.b.reserved +
- fs_usage->online_reserved),
- c->capacity);
-}
-
static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
@@ -240,17 +44,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
+ percpu_u64_get(&c->usage->hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
- bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
+ data = percpu_u64_get(&c->usage->data) +
+ percpu_u64_get(&c->usage->btree);
+ reserved = percpu_u64_get(&c->usage->reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
+ ret.nr_inodes = percpu_u64_get(&c->usage->nr_inodes);
return ret;
}
@@ -267,11 +71,6 @@ bch2_fs_usage_read_short(struct bch_fs *c)
return ret;
}
-void bch2_dev_usage_init(struct bch_dev *ca)
-{
- ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
-}
-
void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");
@@ -285,186 +84,6 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
}
}
-void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- const struct bch_alloc_v4 *old,
- const struct bch_alloc_v4 *new,
- u64 journal_seq, bool gc)
-{
- struct bch_fs_usage *fs_usage;
- struct bch_dev_usage *u;
-
- preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, gc);
-
- if (data_type_is_hidden(old->data_type))
- fs_usage->b.hidden -= ca->mi.bucket_size;
- if (data_type_is_hidden(new->data_type))
- fs_usage->b.hidden += ca->mi.bucket_size;
-
- u = dev_usage_ptr(ca, journal_seq, gc);
-
- u->d[old->data_type].buckets--;
- u->d[new->data_type].buckets++;
-
- u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
- u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
-
- u->d[BCH_DATA_cached].sectors += new->cached_sectors;
- u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
-
- u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
- u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
-
- preempt_enable();
-}
-
-static inline int __update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry_v1 *r,
- s64 sectors)
-{
- int idx = bch2_replicas_entry_idx(c, r);
-
- if (idx < 0)
- return -1;
-
- fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
- fs_usage->replicas[idx] += sectors;
- return 0;
-}
-
-int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry_v1 *r, s64 sectors,
- unsigned journal_seq, bool gc)
-{
- struct bch_fs_usage *fs_usage;
- int idx, ret = 0;
- struct printbuf buf = PRINTBUF;
-
- percpu_down_read(&c->mark_lock);
-
- idx = bch2_replicas_entry_idx(c, r);
- if (idx < 0 &&
- fsck_err(c, ptr_to_missing_replicas_entry,
- "no replicas entry\n while marking %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- percpu_up_read(&c->mark_lock);
- ret = bch2_mark_replicas(c, r);
- percpu_down_read(&c->mark_lock);
-
- if (ret)
- goto err;
- idx = bch2_replicas_entry_idx(c, r);
- }
- if (idx < 0) {
- ret = -1;
- goto err;
- }
-
- preempt_disable();
- fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
- fs_usage->replicas[idx] += sectors;
- preempt_enable();
-err:
-fsck_err:
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
-}
-
-static inline int update_cached_sectors(struct bch_fs *c,
- struct bkey_s_c k,
- unsigned dev, s64 sectors,
- unsigned journal_seq, bool gc)
-{
- struct bch_replicas_padded r;
-
- bch2_replicas_entry_cached(&r.e, dev);
-
- return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
-}
-
-static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
- gfp_t gfp)
-{
- struct replicas_delta_list *d = trans->fs_usage_deltas;
- unsigned new_size = d ? (d->size + more) * 2 : 128;
- unsigned alloc_size = sizeof(*d) + new_size;
-
- WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
-
- if (!d || d->used + more > d->size) {
- d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
-
- if (unlikely(!d)) {
- if (alloc_size > REPLICAS_DELTA_LIST_MAX)
- return -ENOMEM;
-
- d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
- if (!d)
- return -ENOMEM;
-
- memset(d, 0, REPLICAS_DELTA_LIST_MAX);
-
- if (trans->fs_usage_deltas)
- memcpy(d, trans->fs_usage_deltas,
- trans->fs_usage_deltas->size + sizeof(*d));
-
- new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
- kfree(trans->fs_usage_deltas);
- }
-
- d->size = new_size;
- trans->fs_usage_deltas = d;
- }
-
- return 0;
-}
-
-int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
-{
- return allocate_dropping_locks_errcode(trans,
- __replicas_deltas_realloc(trans, more, _gfp));
-}
-
-int bch2_update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry_v1 *r,
- s64 sectors)
-{
- struct replicas_delta_list *d;
- struct replicas_delta *n;
- unsigned b;
- int ret;
-
- if (!sectors)
- return 0;
-
- b = replicas_entry_bytes(r) + 8;
- ret = bch2_replicas_deltas_realloc(trans, b);
- if (ret)
- return ret;
-
- d = trans->fs_usage_deltas;
- n = (void *) d->d + d->used;
- n->delta = sectors;
- unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
- r, replicas_entry_bytes(r),
- "flexible array member embedded in strcuct with padding");
- bch2_replicas_entry_sort(&n->r);
- d->used += b;
- return 0;
-}
-
-int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
-{
- struct bch_replicas_padded r;
-
- bch2_replicas_entry_cached(&r.e, dev);
-
- return bch2_update_replicas_list(trans, &r.e, sectors);
-}
-
static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bkey_s_c k,
struct extent_ptr_decoded p,
@@ -477,7 +96,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (!ca) {
- if (fsck_err(c, ptr_to_invalid_device,
+ if (fsck_err(trans, ptr_to_invalid_device,
"pointer to missing device %u\n"
"while marking %s",
p.ptr.dev,
@@ -489,7 +108,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
if (!g) {
- if (fsck_err(c, ptr_to_invalid_device,
+ if (fsck_err(trans, ptr_to_invalid_device,
"pointer to invalid bucket on device %u\n"
"while marking %s",
p.ptr.dev,
@@ -502,7 +121,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
+ trans, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -519,7 +138,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
}
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
+ trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -533,6 +152,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
g->gen_valid = true;
g->gen = p.ptr.gen;
g->data_type = 0;
+ g->stripe_sectors = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
} else {
@@ -541,7 +161,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
}
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
+ trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
@@ -552,7 +172,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
*do_update = true;
if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
+ trans, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -566,7 +186,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
goto out;
if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- c, ptr_bucket_data_type_mismatch,
+ trans, ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
@@ -578,6 +198,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
g->gen_valid = true;
g->gen = p.ptr.gen;
g->data_type = data_type;
+ g->stripe_sectors = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
} else {
@@ -589,7 +210,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
if (fsck_err_on(!m || !m->alive,
- c, ptr_to_missing_stripe,
+ trans, ptr_to_missing_stripe,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@@ -598,7 +219,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
*do_update = true;
if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
- c, ptr_to_incorrect_stripe,
+ trans, ptr_to_incorrect_stripe,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@@ -766,8 +387,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
BUG_ON(!sectors);
if (gen_after(ptr->gen, b_gen)) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -780,8 +401,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_too_stale,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -800,12 +421,12 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (b_gen != ptr->gen) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_stale_dirty_ptr,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- *bucket_gen(ca, bucket_nr),
+ bucket_gen_get(ca, bucket_nr),
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
@@ -816,8 +437,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -831,8 +452,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
}
if ((u64) *bucket_sectors + sectors > U32_MAX) {
- bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_bucket_sector_count_overflow,
+ bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+ bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@@ -855,47 +476,6 @@ err:
goto out;
}
-void bch2_trans_fs_usage_revert(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *dst;
- struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
- s64 added = 0;
- unsigned i;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- /* revert changes: */
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
- BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
- }
-
- dst->b.nr_inodes -= deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added -= deltas->persistent_reserved[i];
- dst->b.reserved -= deltas->persistent_reserved[i];
- dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
- }
-
- if (added > 0) {
- trans->disk_res->sectors += added;
- this_cpu_add(*c->online_reserved, added);
- }
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-}
-
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
@@ -904,8 +484,6 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
bool warn = false;
percpu_down_read(&c->mark_lock);
- preempt_disable();
- struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
struct bch_fs_usage_base *src = &trans->fs_usage_delta;
s64 added = src->btree + src->data + src->reserved;
@@ -916,13 +494,13 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
*/
s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
- u64 old, new, v = atomic64_read(&c->sectors_available);
+ u64 old, new;
+ old = atomic64_read(&c->sectors_available);
do {
- old = v;
new = max_t(s64, 0, old - should_not_have_added);
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, new)) != old);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, new));
added -= should_not_have_added;
warn = true;
@@ -933,13 +511,9 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
this_cpu_sub(*c->online_reserved, added);
}
- dst->hidden += src->hidden;
- dst->btree += src->btree;
- dst->data += src->data;
- dst->cached += src->cached;
- dst->reserved += src->reserved;
- dst->nr_inodes += src->nr_inodes;
-
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -949,55 +523,18 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
should_not_have_added, disk_res_sectors);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
-{
- struct bch_fs *c = trans->c;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- unsigned i;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
-
- for (d = deltas->d; d != top; d = replicas_delta_next(d))
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
-
- dst->b.nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- dst->b.reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return 0;
-need_mark:
- /* revert changes: */
- for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
- BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
- return -1;
-}
-
/* KEY_TYPE_extent: */
static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
+ const struct extent_ptr_decoded *p,
s64 sectors, enum bch_data_type ptr_data_type,
struct bch_alloc_v4 *a)
{
- u32 *dst_sectors = !ptr->cached
- ? &a->dirty_sectors
- : &a->cached_sectors;
- int ret = bch2_bucket_ref_update(trans, ca, k, ptr, sectors, ptr_data_type,
+ u32 *dst_sectors = p->has_ec ? &a->stripe_sectors :
+ !p->ptr.cached ? &a->dirty_sectors :
+ &a->cached_sectors;
+ int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
a->gen, a->data_type, dst_sectors);
if (ret)
@@ -1032,9 +569,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
if (flags & BTREE_TRIGGER_transactional) {
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket);
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &a->v);
+ __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v);
if (ret)
goto err;
@@ -1057,14 +594,14 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
- ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
- if (!ret) {
- alloc_to_bucket(g, new);
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
- }
+ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new);
+ alloc_to_bucket(g, new);
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
+
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
}
err:
bch2_dev_put(ca);
@@ -1104,10 +641,12 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
- r.e.data_type = data_type;
- ret = bch2_update_replicas_list(trans, &r.e, sectors);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
+ acc.replicas.data_type = data_type;
+ ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -1116,8 +655,6 @@ err:
if (flags & BTREE_TRIGGER_gc) {
struct bch_fs *c = trans->c;
- BUG_ON(!(flags & BTREE_TRIGGER_gc));
-
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
@@ -1140,11 +677,16 @@ err:
m->block_sectors[p.ec.block] += sectors;
- struct bch_replicas_padded r = m->r;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
mutex_unlock(&c->ec_stripes_heap_lock);
- r.e.data_type = data_type;
- bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+ acc.replicas.data_type = data_type;
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
+ if (ret)
+ return ret;
}
return 0;
@@ -1156,20 +698,26 @@ static int __trigger_extent(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
bool gc = flags & BTREE_TRIGGER_gc;
- struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bch_replicas_padded r;
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
s64 replicas_sectors = 0;
int ret = 0;
- r.e.data_type = data_type;
- r.e.nr_devs = 0;
- r.e.nr_required = 1;
+ struct disk_accounting_pos acc_replicas_key = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ .replicas.data_type = data_type,
+ .replicas.nr_devs = 0,
+ .replicas.nr_required = 1,
+ };
+
+ struct disk_accounting_pos acct_compression_key = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ };
+ u64 compression_acct[3] = { 1, 0, 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = 0;
@@ -1179,19 +727,16 @@ static int __trigger_extent(struct btree_trans *trans,
bool stale = ret > 0;
+ if (p.ptr.cached && stale)
+ continue;
+
if (p.ptr.cached) {
- if (!stale) {
- ret = !gc
- ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
- : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
- bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
- bch2_err_str(ret));
- if (ret)
- return ret;
- }
+ ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
+ if (ret)
+ return ret;
} else if (!p.has_ec) {
replicas_sectors += disk_sectors;
- r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
@@ -1202,21 +747,72 @@ static int __trigger_extent(struct btree_trans *trans,
* if so they're not required for mounting if we have an
* erasure coded pointer in this extent:
*/
- r.e.nr_required = 0;
+ acc_replicas_key.replicas.nr_required = 0;
}
- }
- if (r.e.nr_devs) {
- ret = !gc
- ? bch2_update_replicas_list(trans, &r.e, replicas_sectors)
- : bch2_update_replicas(c, k, &r.e, replicas_sectors, 0, true);
- if (unlikely(ret && gc)) {
- struct printbuf buf = PRINTBUF;
+ if (acct_compression_key.compression.type &&
+ acct_compression_key.compression.type != p.crc.compression_type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+
+ compression_acct[0] = 1;
+ compression_acct[1] = 0;
+ compression_acct[2] = 0;
+ }
+
+ acct_compression_key.compression.type = p.crc.compression_type;
+ if (p.crc.compression_type) {
+ compression_acct[1] += p.crc.uncompressed_size;
+ compression_acct[2] += p.crc.compressed_size;
}
+ }
+
+ if (acc_replicas_key.replicas.nr_devs) {
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
+ struct disk_accounting_pos acc_snapshot_key = {
+ .type = BCH_DISK_ACCOUNTING_snapshot,
+ .snapshot.id = k.k->p.snapshot,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ if (acct_compression_key.compression.type) {
+ if (flags & BTREE_TRIGGER_overwrite)
+ bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
+
+ ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
+ ARRAY_SIZE(compression_acct), gc);
+ if (ret)
+ return ret;
+ }
+
+ if (level) {
+ struct disk_accounting_pos acc_btree_key = {
+ .type = BCH_DISK_ACCOUNTING_btree,
+ .btree.id = btree_id,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc);
+ if (ret)
+ return ret;
+ }
+
+ if (bch2_bkey_rebalance_opts(k)) {
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_rebalance_work,
+ };
+ ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
@@ -1269,36 +865,18 @@ static int __trigger_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level, struct bkey_s_c k,
enum btree_iter_update_trigger_flags flags)
{
- struct bch_fs *c = trans->c;
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size * replicas;
-
- if (flags & BTREE_TRIGGER_overwrite)
- sectors = -sectors;
-
- if (flags & BTREE_TRIGGER_transactional) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
+ s64 sectors = k.k->size;
- struct replicas_delta_list *d = trans->fs_usage_deltas;
- replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
+ if (flags & BTREE_TRIGGER_overwrite)
+ sectors = -sectors;
- d->persistent_reserved[replicas - 1] += sectors;
- }
-
- if (flags & BTREE_TRIGGER_gc) {
- percpu_down_read(&c->mark_lock);
- preempt_disable();
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
+ };
- struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
-
- replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->b.reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
}
return 0;
@@ -1330,7 +908,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
- BCH_FSCK_ERR_bucket_metadata_type_mismatch,
+ bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
@@ -1352,10 +930,13 @@ err:
return ret;
}
-static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
u64 b, enum bch_data_type data_type, unsigned sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
@@ -1382,9 +963,10 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ bucket_unlock(g);
percpu_up_read(&c->mark_lock);
- return 0;
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
+ return ret;
err:
bucket_unlock(g);
err_unlock:
@@ -1408,7 +990,7 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return 0;
if (flags & BTREE_TRIGGER_gc)
- return bch2_mark_metadata_bucket(trans->c, ca, b, type, sectors, flags);
+ return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
else if (flags & BTREE_TRIGGER_transactional)
return commit_do(trans, NULL, NULL, 0,
__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
@@ -1523,7 +1105,7 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
u64 sectors, int flags)
{
struct bch_fs_pcpu *pcpu;
- u64 old, v, get;
+ u64 old, get;
s64 sectors_available;
int ret;
@@ -1534,17 +1116,16 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
if (sectors <= pcpu->sectors_available)
goto out;
- v = atomic64_read(&c->sectors_available);
+ old = atomic64_read(&c->sectors_available);
do {
- old = v;
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
preempt_enable();
goto recalculate;
}
- } while ((v = atomic64_cmpxchg(&c->sectors_available,
- old, old - get)) != old);
+ } while (!atomic64_try_cmpxchg(&c->sectors_available,
+ &old, old - get));
pcpu->sectors_available += get;
@@ -1636,7 +1217,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->nbuckets - bucket_gens->first_bucket;
if (resize) {
- down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
percpu_down_write(&c->mark_lock);
}
@@ -1659,7 +1239,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (resize) {
percpu_up_write(&c->mark_lock);
up_write(&ca->bucket_lock);
- up_write(&c->gc_lock);
}
ret = 0;
@@ -1674,23 +1253,14 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{
kvfree(ca->buckets_nouse);
kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
-
- for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
- free_percpu(ca->usage[i]);
- kfree(ca->usage_base);
+ free_percpu(ca->usage);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
- if (!ca->usage_base)
+ ca->usage = alloc_percpu(struct bch_dev_usage);
+ if (!ca->usage)
return -BCH_ERR_ENOMEM_usage_init;
- for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
- ca->usage[i] = alloc_percpu(struct bch_dev_usage);
- if (!ca->usage[i])
- return -BCH_ERR_ENOMEM_usage_init;
- }
-
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 80ee0be9793e..2d35eeb24a2d 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -85,7 +85,7 @@ static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
return rcu_dereference_check(ca->buckets_gc,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
@@ -103,7 +103,7 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
return rcu_dereference_check(ca->bucket_gens,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
- lockdep_is_held(&ca->fs->gc_lock) ||
+ lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
@@ -116,6 +116,14 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
+static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b)
+{
+ rcu_read_lock();
+ u8 gen = *bucket_gen(ca, b);
+ rcu_read_unlock();
+ return gen;
+}
+
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
@@ -204,7 +212,6 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_init(struct bch_dev *);
void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
@@ -266,73 +273,14 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
-static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
-{
- return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_u64s(struct bch_fs *c)
-{
- return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
-}
-
-static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
-{
- return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
-}
-
-static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
-{
- return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
-}
-
static inline unsigned dev_usage_u64s(void)
{
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
-u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
-
-struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
-
-void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
-
-void bch2_fs_usage_to_text(struct printbuf *,
- struct bch_fs *, struct bch_fs_usage_online *);
-
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
-
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
-void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
- const struct bch_alloc_v4 *,
- const struct bch_alloc_v4 *, u64, bool);
-
-/* key/bucket marking: */
-
-static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
- unsigned journal_seq,
- bool gc)
-{
- percpu_rwsem_assert_held(&c->mark_lock);
- BUG_ON(!gc && !journal_seq);
-
- return this_cpu_ptr(gc
- ? c->usage_gc
- : c->usage[journal_seq & JOURNAL_BUF_MASK]);
-}
-
-int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
- struct bch_replicas_entry_v1 *, s64,
- unsigned, bool);
-int bch2_update_replicas_list(struct btree_trans *,
- struct bch_replicas_entry_v1 *, s64);
-int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
-int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
-
-void bch2_fs_usage_initialize(struct bch_fs *);
-
int bch2_bucket_ref_update(struct btree_trans *, struct bch_dev *,
struct bkey_s_c, const struct bch_extent_ptr *,
s64, enum bch_data_type, u8, u8, u32 *);
@@ -361,9 +309,6 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
void bch2_trans_account_disk_usage_change(struct btree_trans *);
-void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
-int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
-
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, u64,
enum bch_data_type, unsigned,
enum btree_iter_update_trigger_flags);
@@ -424,13 +369,13 @@ static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reserv
#ifdef __KERNEL__
u64 old, new;
+ old = this_cpu_read(c->pcpu->sectors_available);
do {
- old = this_cpu_read(c->pcpu->sectors_available);
if (sectors > old)
return __bch2_disk_reservation_add(c, res, sectors, flags);
new = old - sectors;
- } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old);
+ } while (!this_cpu_try_cmpxchg(c->pcpu->sectors_available, &old, new));
this_cpu_add(*c->online_reserved, sectors);
res->sectors += sectors;
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index f636e17c4caf..c9698cdf866f 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -16,7 +16,8 @@ struct bucket {
u32 stripe;
u32 dirty_sectors;
u32 cached_sectors;
-};
+ u32 stripe_sectors;
+} __aligned(sizeof(long));
struct bucket_array {
struct rcu_head rcu;
@@ -35,7 +36,7 @@ struct bucket_gens {
};
struct bch_dev_usage {
- struct {
+ struct bch_dev_usage_type {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
/*
@@ -56,18 +57,6 @@ struct bch_fs_usage_base {
u64 nr_inodes;
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
- struct bch_fs_usage_base b;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- u64 replicas[];
-};
-
-struct bch_fs_usage_online {
- u64 online_reserved;
- struct bch_fs_usage u;
-};
-
struct bch_fs_usage_short {
u64 capacity;
u64 used;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 9e54323f0f5f..ef1f74866e23 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -5,6 +5,7 @@
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
+#include "disk_accounting.h"
#include "journal.h"
#include "move.h"
#include "recovery_passes.h"
@@ -213,16 +214,17 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, optstr);
- kfree(optstr);
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
}
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+ opt_set(thr->opts, read_only, 1);
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
@@ -319,7 +321,8 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
return ret;
ret = bch2_dev_add(c, path);
- kfree(path);
+ if (!IS_ERR(path))
+ kfree(path);
return ret;
}
@@ -501,11 +504,9 @@ static long bch2_ioctl_data(struct bch_fs *c,
static long bch2_ioctl_fs_usage(struct bch_fs *c,
struct bch_ioctl_fs_usage __user *user_arg)
{
- struct bch_ioctl_fs_usage *arg = NULL;
- struct bch_replicas_usage *dst_e, *dst_end;
- struct bch_fs_usage_online *src;
+ struct bch_ioctl_fs_usage arg = {};
+ darray_char replicas = {};
u32 replica_entries_bytes;
- unsigned i;
int ret = 0;
if (!test_bit(BCH_FS_started, &c->flags))
@@ -514,62 +515,60 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
return -EFAULT;
- arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
- if (!arg)
- return -ENOMEM;
-
- src = bch2_fs_usage_read(c);
- if (!src) {
- ret = -ENOMEM;
+ ret = bch2_fs_replicas_usage_read(c, &replicas) ?:
+ (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr);
+ if (ret)
goto err;
- }
-
- arg->capacity = c->capacity;
- arg->used = bch2_fs_sectors_used(c, src);
- arg->online_reserved = src->online_reserved;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- arg->persistent_reserved[i] = src->u.persistent_reserved[i];
-
- dst_e = arg->replicas;
- dst_end = (void *) arg->replicas + replica_entries_bytes;
-
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *src_e =
- cpu_replicas_entry(&c->replicas, i);
-
- /* check that we have enough space for one replicas entry */
- if (dst_e + 1 > dst_end) {
- ret = -ERANGE;
- break;
- }
- dst_e->sectors = src->u.replicas[i];
- dst_e->r = *src_e;
+ struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
+ arg.capacity = c->capacity;
+ arg.used = u.used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.replica_entries_bytes = replicas.nr;
- /* recheck after setting nr_devs: */
- if (replicas_usage_next(dst_e) > dst_end) {
- ret = -ERANGE;
- break;
- }
-
- memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_persistent_reserved,
+ .persistent_reserved.nr_replicas = i,
+ };
- dst_e = replicas_usage_next(dst_e);
+ bch2_accounting_mem_read(c,
+ disk_accounting_pos_to_bpos(&k),
+ &arg.persistent_reserved[i], 1);
}
- arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+err:
+ darray_exit(&replicas);
+ return ret;
+}
+
+static long bch2_ioctl_query_accounting(struct bch_fs *c,
+ struct bch_ioctl_query_accounting __user *user_arg)
+{
+ struct bch_ioctl_query_accounting arg;
+ darray_char accounting = {};
+ int ret = 0;
- percpu_up_read(&c->mark_lock);
- kfree(src);
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+ ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?:
+ bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?:
+ (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?:
+ copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr);
if (ret)
goto err;
- ret = copy_to_user_errcode(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes);
+ arg.capacity = c->capacity;
+ arg.used = bch2_fs_usage_read_short(c).used;
+ arg.online_reserved = percpu_u64_get(c->online_reserved);
+ arg.accounting_u64s = accounting.nr / sizeof(u64);
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
err:
- kfree(arg);
+ darray_exit(&accounting);
return ret;
}
@@ -604,7 +603,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- for (i = 0; i < BCH_DATA_NR; i++) {
+ for (i = 0; i < ARRAY_SIZE(arg.d); i++) {
arg.d[i].buckets = src.d[i].buckets;
arg.d[i].sectors = src.d[i].sectors;
arg.d[i].fragmented = src.d[i].fragmented;
@@ -849,8 +848,9 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, optstr);
- kfree(optstr);
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ if (!IS_ERR(optstr))
+ kfree(optstr);
if (ret)
goto err;
@@ -925,6 +925,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
case BCH_IOCTL_FSCK_ONLINE:
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
+ case BCH_IOCTL_QUERY_ACCOUNTING:
+ return bch2_ioctl_query_accounting(c, arg);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3bd3aba90d8f..e7208bf1974e 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -10,6 +10,7 @@
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
+#include <linux/ratelimit.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
#include <crypto/chacha.h>
@@ -436,7 +437,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
- "expected %0llx:%0llx got %0llx:%0llx (old type ",
+ " expected %0llx:%0llx got %0llx:%0llx (old type ",
__func__,
crc_old.csum.hi,
crc_old.csum.lo,
@@ -446,7 +447,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, " new type ");
bch2_prt_csum_type(&buf, new_csum_type);
prt_str(&buf, ")");
- bch_err(c, "%s", buf.buf);
+ WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
return -EIO;
}
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 18fab9c44b1b..1d6b691e8da6 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -24,7 +24,6 @@ static inline void io_timer_swp(void *l, void *r, void __always_unused *args)
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
const struct min_heap_callbacks callbacks = {
.less = io_timer_cmp,
.swp = io_timer_swp,
@@ -32,14 +31,13 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- if (time_after_eq((unsigned long) atomic64_read(&clock->now),
- timer->expire)) {
+ if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
return;
}
- for (i = 0; i < clock->timers.nr; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer)
goto out;
@@ -50,7 +48,6 @@ out:
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
- size_t i;
const struct min_heap_callbacks callbacks = {
.less = io_timer_cmp,
.swp = io_timer_swp,
@@ -58,7 +55,7 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
- for (i = 0; i < clock->timers.nr; i++)
+ for (size_t i = 0; i < clock->timers.nr; i++)
if (clock->timers.data[i] == timer) {
min_heap_del(&clock->timers, i, &callbacks, NULL);
break;
@@ -92,33 +89,31 @@ static void io_clock_cpu_timeout(struct timer_list *timer)
wake_up_process(wait->task);
}
-void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- /* XXX: calculate sleep time rigorously */
- wait.io_timer.expire = until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
-
schedule();
-
bch2_io_timer_del(clock, &wait.io_timer);
}
void bch2_kthread_io_clock_wait(struct io_clock *clock,
- unsigned long io_until,
- unsigned long cpu_timeout)
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct io_clock_wait wait;
+ struct io_clock_wait wait = {
+ .io_timer.expire = io_until,
+ .io_timer.fn = io_clock_wait_fn,
+ .io_timer.fn2 = (void *) _RET_IP_,
+ .task = current,
+ };
- wait.io_timer.expire = io_until;
- wait.io_timer.fn = io_clock_wait_fn;
- wait.task = current;
- wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
@@ -144,8 +139,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
bch2_io_timer_del(clock, &wait.io_timer);
}
-static struct io_timer *get_expired_timer(struct io_clock *clock,
- unsigned long now)
+static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
{
struct io_timer *ret = NULL;
const struct min_heap_callbacks callbacks = {
@@ -153,41 +147,40 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
.swp = io_timer_swp,
};
- spin_lock(&clock->timer_lock);
-
if (clock->timers.nr &&
- time_after_eq(now, clock->timers.data[0]->expire)) {
+ time_after_eq64(now, clock->timers.data[0]->expire)) {
ret = *min_heap_peek(&clock->timers);
min_heap_pop(&clock->timers, &callbacks, NULL);
}
- spin_unlock(&clock->timer_lock);
-
return ret;
}
-void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+void __bch2_increment_clock(struct io_clock *clock, u64 sectors)
{
struct io_timer *timer;
- unsigned long now = atomic64_add_return(sectors, &clock->now);
+ u64 now = atomic64_add_return(sectors, &clock->now);
+ spin_lock(&clock->timer_lock);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
+ spin_unlock(&clock->timer_lock);
}
void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
{
- unsigned long now;
- unsigned i;
-
out->atomic++;
spin_lock(&clock->timer_lock);
- now = atomic64_read(&clock->now);
+ u64 now = atomic64_read(&clock->now);
+
+ printbuf_tabstop_push(out, 40);
+ prt_printf(out, "current time:\t%llu\n", now);
- for (i = 0; i < clock->timers.nr; i++)
- prt_printf(out, "%ps:\t%li\n",
+ for (unsigned i = 0; i < clock->timers.nr; i++)
+ prt_printf(out, "%ps %ps:\t%llu\n",
clock->timers.data[i]->fn,
- clock->timers.data[i]->expire - now);
+ clock->timers.data[i]->fn2,
+ clock->timers.data[i]->expire);
spin_unlock(&clock->timer_lock);
--out->atomic;
}
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 70a0f7436c84..85c975dfbcfe 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,12 +4,11 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
- unsigned long);
+void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
-void __bch2_increment_clock(struct io_clock *, unsigned);
+void __bch2_increment_clock(struct io_clock *, u64);
-static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
int rw)
{
struct io_clock *clock = &c->io_clock[rw];
@@ -19,7 +18,7 @@ static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
__bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
}
-void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
({ \
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
index f2c8a25b7079..37554e4514fe 100644
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
@@ -17,7 +17,8 @@ typedef void (*io_timer_fn)(struct io_timer *);
struct io_timer {
io_timer_fn fn;
- unsigned long expire;
+ void *fn2;
+ u64 expire;
};
/* Amount to buffer up on a percpu counter */
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index ac35b8b705ae..b7d223f85873 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -13,7 +13,8 @@ int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, g
if (!data)
return -ENOMEM;
- memcpy(data, d->data, d->size * element_size);
+ if (d->size)
+ memcpy(data, d->data, d->size * element_size);
if (d->data != d->preallocated)
kvfree(d->data);
d->data = data;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 1a0072eef109..0087b8555ead 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -5,7 +5,9 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "data_update.h"
+#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
@@ -454,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
}
}
+void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs:\t");
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target:\t");
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression:\t");
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas:\t");
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+ prt_newline(out);
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+}
+
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -643,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans,
if (!(durability_have + durability_removing))
m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+ if (!m->op.nr_replicas) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_data_update_to_text(&buf, m);
+ WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -BCH_ERR_data_update_done;
+ goto done;
+ }
+
m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 991095bbd469..8d36365bdea8 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -17,6 +17,9 @@ struct data_update_opts {
unsigned write_flags;
};
+void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_opts *, struct data_update_opts *);
+
struct data_update {
/* extent being updated: */
enum btree_id btree_id;
@@ -27,6 +30,8 @@ struct data_update {
struct bch_write_op op;
};
+void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+
int bch2_data_update_index_update(struct bch_write_op *);
void bch2_data_update_read_done(struct data_update *,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 51cbf3928361..ebabab171fe5 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -568,6 +568,32 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
+typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r);
+
+static void list_sort(struct list_head *head, list_cmp_fn cmp)
+{
+ struct list_head *pos;
+
+ list_for_each(pos, head)
+ while (!list_is_last(pos, head) &&
+ cmp(pos, pos->next) > 0) {
+ struct list_head *pos2, *next = pos->next;
+
+ list_del(next);
+ list_for_each(pos2, head)
+ if (cmp(next, pos2) < 0)
+ goto pos_found;
+ BUG();
+pos_found:
+ list_add_tail(next, pos2);
+ }
+}
+
+static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r)
+{
+ return cmp_int(l, r);
+}
+
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -575,41 +601,39 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
struct bch_fs *c = i->c;
struct btree_trans *trans;
ssize_t ret = 0;
- u32 seq;
i->ubuf = buf;
i->size = size;
i->ret = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= i->iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ i->iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto unlocked;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
+
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
prt_printf(&i->buf, "backtrace:\n");
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = task->pid;
-
closure_put(&trans->ref);
+ ret = flush_buf(i);
+ if (ret)
+ goto unlocked;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
@@ -804,50 +828,55 @@ static const struct file_operations btree_transaction_stats_op = {
.read = btree_transaction_stats_read,
};
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+/* walk btree transactions until we find a deadlock and print it */
+static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct dump_iter *i = file->private_data;
- struct bch_fs *c = i->c;
struct btree_trans *trans;
- ssize_t ret = 0;
- u32 seq;
-
- i->ubuf = buf;
- i->size = size;
- i->ret = 0;
-
- if (i->iter)
- goto out;
+ ulong iter = 0;
restart:
seqmutex_lock(&c->btree_trans_lock);
- list_for_each_entry(trans, &c->btree_trans_list, list) {
- struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+ list_sort(&c->btree_trans_list, list_ptr_order_cmp);
- if (!task || task->pid <= i->iter)
+ list_for_each_entry(trans, &c->btree_trans_list, list) {
+ if ((ulong) trans <= iter)
continue;
- closure_get(&trans->ref);
- seq = seqmutex_seq(&c->btree_trans_lock);
- seqmutex_unlock(&c->btree_trans_lock);
+ iter = (ulong) trans;
- ret = flush_buf(i);
- if (ret) {
- closure_put(&trans->ref);
- goto out;
- }
+ if (!closure_get_not_zero(&trans->ref))
+ continue;
- bch2_check_for_deadlock(trans, &i->buf);
+ u32 seq = seqmutex_unlock(&c->btree_trans_lock);
- i->iter = task->pid;
+ bool found = bch2_check_for_deadlock(trans, out) != 0;
closure_put(&trans->ref);
+ if (found)
+ return;
+
if (!seqmutex_relock(&c->btree_trans_lock, seq))
goto restart;
}
seqmutex_unlock(&c->btree_trans_lock);
-out:
+}
+
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ btree_deadlock_to_text(&i->buf, c);
+ i->iter++;
+ }
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index c67460d8205d..d743da89308e 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -534,6 +534,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subvol_inum target)
{
struct qstr name = bch2_dirent_get_name(d);
+ /*
+ * Although not required by the kernel code, updating ctx->pos is needed
+ * for the bcachefs FUSE driver. Without this update, the FUSE
+ * implementation will be stuck in an infinite loop when reading
+ * directories (via the bcachefs_fuse_readdir callback).
+ * In kernel space, ctx->pos is updated by the VFS code.
+ */
+ ctx->pos = d.k->p.offset;
bool ret = dir_emit(ctx, name.name,
name.len,
target.inum,
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
new file mode 100644
index 000000000000..dcdd59249c23
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.c
@@ -0,0 +1,790 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bcachefs_ioctl.h"
+#include "btree_cache.h"
+#include "btree_journal_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "compress.h"
+#include "disk_accounting.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+
+/*
+ * Notes on disk accounting:
+ *
+ * We have two parallel sets of counters to be concerned with, and both must be
+ * kept in sync.
+ *
+ * - Persistent/on disk accounting, stored in the accounting btree and updated
+ * via btree write buffer updates that treat new accounting keys as deltas to
+ * apply to existing values. But reading from a write buffer btree is
+ * expensive, so we also have
+ *
+ * - In memory accounting, where accounting is stored as an array of percpu
+ * counters, indexed by an eytzinger array of disk acounting keys/bpos (which
+ * are the same thing, excepting byte swabbing on big endian).
+ *
+ * Cheap to read, but non persistent.
+ *
+ * Disk accounting updates are generated by transactional triggers; these run as
+ * keys enter and leave the btree, and can compare old and new versions of keys;
+ * the output of these triggers are deltas to the various counters.
+ *
+ * Disk accounting updates are done as btree write buffer updates, where the
+ * counters in the disk accounting key are deltas that will be applied to the
+ * counter in the btree when the key is flushed by the write buffer (or journal
+ * replay).
+ *
+ * To do a disk accounting update:
+ * - initialize a disk_accounting_pos, to specify which counter is being update
+ * - initialize counter deltas, as an array of 1-3 s64s
+ * - call bch2_disk_accounting_mod()
+ *
+ * This queues up the accounting update to be done at transaction commit time.
+ * Underneath, it's a normal btree write buffer update.
+ *
+ * The transaction commit path is responsible for propagating updates to the in
+ * memory counters, with bch2_accounting_mem_mod().
+ *
+ * The commit path also assigns every disk accounting update a unique version
+ * number, based on the journal sequence number and offset within that journal
+ * buffer; this is used by journal replay to determine which updates have been
+ * done.
+ *
+ * The transaction commit path also ensures that replicas entry accounting
+ * updates are properly marked in the superblock (so that we know whether we can
+ * mount without data being unavailable); it will update the superblock if
+ * bch2_accounting_mem_mod() tells it to.
+ */
+
+static const char * const disk_accounting_type_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ NULL
+};
+
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
+ s64 *d, unsigned nr)
+{
+ struct bkey_i_accounting *acc = bkey_accounting_init(k);
+
+ acc->k.p = disk_accounting_pos_to_bpos(pos);
+ set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
+
+ memcpy_u64s_small(acc->v.d, d, nr);
+}
+
+int bch2_disk_accounting_mod(struct btree_trans *trans,
+ struct disk_accounting_pos *k,
+ s64 *d, unsigned nr, bool gc)
+{
+ /* Normalize: */
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
+ break;
+ }
+
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, k, d, nr);
+
+ return likely(!gc)
+ ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
+ : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+}
+
+int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
+ unsigned dev, s64 sectors,
+ bool gc)
+{
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ bch2_replicas_entry_cached(&acc.replicas, dev);
+
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
+}
+
+int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k,
+ enum bch_validate_flags flags,
+ struct printbuf *err)
+{
+ return 0;
+}
+
+void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
+{
+ if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
+ prt_printf(out, "unknown type %u", k->type);
+ return;
+ }
+
+ prt_str(out, disk_accounting_type_strs[k->type]);
+ prt_str(out, " ");
+
+ switch (k->type) {
+ case BCH_DISK_ACCOUNTING_nr_inodes:
+ break;
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ bch2_replicas_entry_to_text(out, &k->replicas);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
+ bch2_prt_data_type(out, k->dev_data_type.data_type);
+ break;
+ case BCH_DISK_ACCOUNTING_compression:
+ bch2_prt_compression_type(out, k->compression.type);
+ break;
+ case BCH_DISK_ACCOUNTING_snapshot:
+ prt_printf(out, "id=%u", k->snapshot.id);
+ break;
+ case BCH_DISK_ACCOUNTING_btree:
+ prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id));
+ break;
+ }
+}
+
+void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+ bch2_accounting_key_to_text(out, &acc_k);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
+ prt_printf(out, " %lli", acc.v->d[i]);
+}
+
+void bch2_accounting_swab(struct bkey_s k)
+{
+ for (u64 *p = (u64 *) k.v;
+ p < (u64 *) bkey_val_end(k);
+ p++)
+ *p = swab64(*p);
+}
+
+static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
+{
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, p);
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_replicas:
+ unsafe_memcpy(r, &acc_k.replicas,
+ replicas_entry_bytes(&acc_k.replicas),
+ "variable length struct");
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
+{
+ struct bch_replicas_padded r;
+ return accounting_to_replicas(&r.e, p)
+ ? bch2_mark_replicas(c, &r.e)
+ : 0;
+}
+
+/*
+ * Ensure accounting keys being updated are present in the superblock, when
+ * applicable (i.e. replicas updates)
+ */
+int bch2_accounting_update_sb(struct btree_trans *trans)
+{
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i))
+ if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
+ int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ /* raced with another insert, already present: */
+ if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p) < acc->k.nr)
+ return 0;
+
+ struct accounting_mem_entry n = {
+ .pos = a.k->p,
+ .version = a.k->version,
+ .nr_counters = bch2_accounting_counters(a.k),
+ .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL),
+ };
+
+ if (!n.v[0])
+ goto err;
+
+ if (acc->gc_running) {
+ n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!n.v[1])
+ goto err;
+ }
+
+ if (darray_push(&acc->k, n))
+ goto err;
+
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ return 0;
+err:
+ free_percpu(n.v[1]);
+ free_percpu(n.v[0]);
+ return -BCH_ERR_ENOMEM_disk_accounting;
+}
+
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
+{
+ struct bch_replicas_padded r;
+
+ if (accounting_to_replicas(&r.e, a.k->p) &&
+ !bch2_replicas_marked_locked(c, &r.e))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ percpu_up_read(&c->mark_lock);
+ percpu_down_write(&c->mark_lock);
+ int ret = __bch2_accounting_mem_insert(c, a);
+ percpu_up_write(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+ return ret;
+}
+
+static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
+{
+ for (unsigned i = 0; i < e->nr_counters; i++)
+ if (percpu_u64_get(e->v[0] + i) ||
+ (e->v[1] &&
+ percpu_u64_get(e->v[1] + i)))
+ return false;
+ return true;
+}
+
+void bch2_accounting_mem_gc(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ percpu_down_write(&c->mark_lock);
+ struct accounting_mem_entry *dst = acc->k.data;
+
+ darray_for_each(acc->k, src) {
+ if (accounting_mem_entry_is_zero(src)) {
+ free_percpu(src->v[0]);
+ free_percpu(src->v[1]);
+ } else {
+ *dst++ = *src;
+ }
+ }
+
+ acc->k.nr = dst - acc->k.data;
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ percpu_up_write(&c->mark_lock);
+}
+
+/*
+ * Read out accounting keys for replicas entries, as an array of
+ * bch_replicas_usage entries.
+ *
+ * Note: this may be deprecated/removed at smoe point in the future and replaced
+ * with something more general, it exists to support the ioctl used by the
+ * 'bcachefs fs usage' command.
+ */
+int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(usage);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct {
+ struct bch_replicas_usage r;
+ u8 pad[BCH_BKEY_PTRS_MAX];
+ } u;
+
+ if (!accounting_to_replicas(&u.r.r, i->pos))
+ continue;
+
+ u64 sectors;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
+ u.r.sectors = sectors;
+
+ ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
+ if (ret)
+ break;
+
+ memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
+ usage->nr += replicas_usage_bytes(&u.r);
+ }
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(usage);
+ return ret;
+}
+
+int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
+{
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ darray_init(out_buf);
+
+ percpu_down_read(&c->mark_lock);
+ darray_for_each(acc->k, i) {
+ struct disk_accounting_pos a_p;
+ bpos_to_disk_accounting_pos(&a_p, i->pos);
+
+ if (!(accounting_types_mask & BIT(a_p.type)))
+ continue;
+
+ ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
+ sizeof(u64) * i->nr_counters);
+ if (ret)
+ break;
+
+ struct bkey_i_accounting *a_out =
+ bkey_accounting_init((void *) &darray_top(*out_buf));
+ set_bkey_val_u64s(&a_out->k, i->nr_counters);
+ a_out->k.p = i->pos;
+ bch2_accounting_mem_read_counters(acc, i - acc->k.data,
+ a_out->v.d, i->nr_counters, false);
+
+ if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
+ out_buf->nr += bkey_bytes(&a_out->k);
+ }
+
+ percpu_up_read(&c->mark_lock);
+
+ if (ret)
+ darray_exit(out_buf);
+ return ret;
+}
+
+void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ percpu_down_read(&c->mark_lock);
+ out->atomic++;
+
+ eytzinger0_for_each(i, acc->k.nr) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);
+
+ bch2_accounting_key_to_text(out, &acc_k);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ prt_str(out, ":");
+ for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
+ prt_printf(out, " %llu", v[j]);
+ prt_newline(out);
+ }
+
+ --out->atomic;
+ percpu_up_read(&c->mark_lock);
+}
+
+static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
+{
+ darray_for_each(acc->k, e) {
+ free_percpu(e->v[gc]);
+ e->v[gc] = NULL;
+ }
+}
+
+int bch2_gc_accounting_start(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ darray_for_each(acc->k, e) {
+ e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
+ if (!e->v[1]) {
+ bch2_accounting_free_counters(acc, true);
+ ret = -BCH_ERR_ENOMEM_disk_accounting;
+ break;
+ }
+ }
+
+ acc->gc_running = !ret;
+ percpu_up_write(&c->mark_lock);
+
+ return ret;
+}
+
+int bch2_gc_accounting_done(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
+ struct bpos pos = POS_MIN;
+ int ret = 0;
+
+ percpu_down_write(&c->mark_lock);
+ while (1) {
+ unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &pos);
+
+ if (idx >= acc->k.nr)
+ break;
+
+ struct accounting_mem_entry *e = acc->k.data + idx;
+ pos = bpos_successor(e->pos);
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, e->pos);
+
+ u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
+ u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];
+
+ unsigned nr = e->nr_counters;
+ bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
+ bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);
+
+ if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
+ printbuf_reset(&buf);
+ prt_str(&buf, "accounting mismatch for ");
+ bch2_accounting_key_to_text(&buf, &acc_k);
+
+ prt_str(&buf, ": got");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", dst_v[j]);
+
+ prt_str(&buf, " should be");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", src_v[j]);
+
+ for (unsigned j = 0; j < nr; j++)
+ src_v[j] -= dst_v[j];
+
+ if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
+ percpu_up_write(&c->mark_lock);
+ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
+ percpu_down_write(&c->mark_lock);
+ if (ret)
+ goto err;
+
+ if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+
+ accounting_key_init(&k_i.k, &acc_k, src_v, nr);
+ bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false);
+
+ preempt_disable();
+ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
+ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
+ preempt_enable();
+ }
+ }
+ }
+ }
+err:
+fsck_err:
+ percpu_up_write(&c->mark_lock);
+ printbuf_exit(&buf);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ if (k.k->type != KEY_TYPE_accounting)
+ return 0;
+
+ percpu_down_read(&c->mark_lock);
+ int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false);
+ percpu_up_read(&c->mark_lock);
+
+ if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
+ ret == -BCH_ERR_btree_insert_need_mark_replicas)
+ ret = 0;
+
+ struct disk_accounting_pos acc;
+ bpos_to_disk_accounting_pos(&acc, k.k->p);
+
+ if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (bch2_accounting_key_to_text(&buf, &acc),
+ buf.buf)))
+ ret = bch2_accounting_update_sb_one(c, k.k->p);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ int ret = for_each_btree_key(trans, iter,
+ BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
+ accounting_read_key(trans, k);
+ }));
+ if (ret)
+ goto err;
+
+ struct journal_keys *keys = &c->journal_keys;
+ struct journal_key *dst = keys->data;
+ move_gap(keys, keys->nr);
+
+ darray_for_each(*keys, i) {
+ if (i->k->k.type == KEY_TYPE_accounting) {
+ struct bkey_s_c k = bkey_i_to_s_c(i->k);
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
+ sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &k.k->p);
+
+ bool applied = idx < acc->k.nr &&
+ bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;
+
+ if (applied)
+ continue;
+
+ if (i + 1 < &darray_top(*keys) &&
+ i[1].k->k.type == KEY_TYPE_accounting &&
+ !journal_key_cmp(i, i + 1)) {
+ BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0);
+
+ i[1].journal_seq = i[0].journal_seq;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
+ bkey_s_c_to_accounting(k));
+ continue;
+ }
+
+ ret = accounting_read_key(trans, k);
+ if (ret)
+ goto err;
+ }
+
+ *dst++ = *i;
+ }
+ keys->gap = keys->nr = dst - keys->data;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
+
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ switch (k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev);
+ if (ca) {
+ struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
+ percpu_u64_set(&d->buckets, v[0]);
+ percpu_u64_set(&d->sectors, v[1]);
+ percpu_u64_set(&d->fragmented, v[2]);
+
+ if (k.dev_data_type.data_type == BCH_DATA_sb ||
+ k.dev_data_type.data_type == BCH_DATA_journal)
+ usage->hidden += v[0] * ca->mi.bucket_size;
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
+err:
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
+{
+ return bch2_trans_run(c,
+ bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
+ struct disk_accounting_pos acc;
+ bpos_to_disk_accounting_pos(&acc, k.k->p);
+
+ acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
+ acc.dev_data_type.dev == dev
+ ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
+ : 0;
+ })) ?:
+ bch2_btree_write_buffer_flush_sync(trans));
+}
+
+int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
+{
+ struct bch_fs *c = ca->fs;
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
+
+ int ret = bch2_trans_do(c, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+void bch2_verify_accounting_clean(struct bch_fs *c)
+{
+ bool mismatch = false;
+ struct bch_fs_usage_base base = {}, base_inmem = {};
+
+ bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_accounting, POS_MIN,
+ BTREE_ITER_all_snapshots, k, ({
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
+ unsigned nr = bch2_accounting_counters(k.k);
+
+ bch2_accounting_mem_read(c, k.k->p, v, nr);
+
+ if (memcmp(a.v->d, v, nr * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " !=");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+ if (!ca) {
+ rcu_read_unlock();
+ continue;
+ }
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
+ rcu_read_unlock();
+
+ if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, " in mem");
+ for (unsigned j = 0; j < nr; j++)
+ prt_printf(&buf, " %llu", v[j]);
+
+ pr_err("dev accounting mismatch: %s", buf.buf);
+ printbuf_exit(&buf);
+ mismatch = true;
+ }
+ }
+ }
+
+ 0;
+ })));
+
+ acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));
+
+#define check(x) \
+ if (base.x != base_inmem.x) { \
+ pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
+ mismatch = true; \
+ }
+
+ //check(hidden);
+ check(btree);
+ check(data);
+ check(cached);
+ check(reserved);
+ check(nr_inodes);
+
+ WARN_ON(mismatch);
+}
+
+void bch2_accounting_gc_free(struct bch_fs *c)
+{
+ lockdep_assert_held(&c->mark_lock);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, true);
+ acc->gc_running = false;
+}
+
+void bch2_fs_accounting_exit(struct bch_fs *c)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+
+ bch2_accounting_free_counters(acc, false);
+ darray_exit(&acc->k);
+}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
new file mode 100644
index 000000000000..3d3f25e08b69
--- /dev/null
+++ b/fs/bcachefs/disk_accounting.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_H
+#define _BCACHEFS_DISK_ACCOUNTING_H
+
+#include "eytzinger.h"
+#include "sb-members.h"
+
+static inline void bch2_u64s_neg(u64 *v, unsigned nr)
+{
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = -v[i];
+}
+
+static inline unsigned bch2_accounting_counters(const struct bkey *k)
+{
+ return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64);
+}
+
+static inline void bch2_accounting_neg(struct bkey_s_accounting a)
+{
+ bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k));
+}
+
+static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
+{
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ if (a.v->d[i])
+ return false;
+ return true;
+}
+
+static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
+ struct bkey_s_c_accounting src)
+{
+ EBUG_ON(dst->k.u64s != src.k->u64s);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+ dst->v.d[i] += src.v->d[i];
+ if (bversion_cmp(dst->k.version, src.k->version) < 0)
+ dst->k.version = src.k->version;
+}
+
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
+ enum bch_data_type data_type,
+ s64 sectors)
+{
+ switch (data_type) {
+ case BCH_DATA_btree:
+ fs_usage->btree += sectors;
+ break;
+ case BCH_DATA_user:
+ case BCH_DATA_parity:
+ fs_usage->data += sectors;
+ break;
+ case BCH_DATA_cached:
+ fs_usage->cached += sectors;
+ break;
+ default:
+ break;
+ }
+}
+
+static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
+{
+ acc->_pad = p;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ bch2_bpos_swab(&acc->_pad);
+#endif
+}
+
+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k)
+{
+ struct bpos ret = k->_pad;
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ bch2_bpos_swab(&ret);
+#endif
+ return ret;
+}
+
+int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
+ s64 *, unsigned, bool);
+int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
+
+int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
+ enum bch_validate_flags, struct printbuf *);
+void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
+void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_accounting_swab(struct bkey_s);
+
+#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
+ .key_invalid = bch2_accounting_invalid, \
+ .val_to_text = bch2_accounting_to_text, \
+ .swab = bch2_accounting_swab, \
+ .min_val_size = 8, \
+})
+
+int bch2_accounting_update_sb(struct btree_trans *);
+
+static inline int accounting_pos_cmp(const void *_l, const void *_r)
+{
+ const struct bpos *l = _l, *r = _r;
+
+ return bpos_cmp(*l, *r);
+}
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool);
+void bch2_accounting_mem_gc(struct bch_fs *);
+
+static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ unsigned idx;
+
+ EBUG_ON(gc && !acc->gc_running);
+
+ while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
+ int ret = bch2_accounting_mem_insert(c, a, gc);
+ if (ret)
+ return ret;
+ }
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);
+
+ for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
+ this_cpu_add(e->v[gc][i], a.v->d[i]);
+ return 0;
+}
+
+/*
+ * Update in memory counters so they match the btree update we're doing; called
+ * from transaction commit path
+ */
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
+{
+ struct bch_fs *c = trans->c;
+
+ if (!gc) {
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+
+ switch (acc_k.type) {
+ case BCH_DISK_ACCOUNTING_persistent_reserved:
+ trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
+ break;
+ case BCH_DISK_ACCOUNTING_replicas:
+ fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
+ break;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+ if (ca) {
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
+ this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
+ }
+ rcu_read_unlock();
+ break;
+ }
+ }
+
+ return __bch2_accounting_mem_mod(c, a, gc);
+}
+
+static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
+{
+ percpu_down_read(&trans->c->mark_lock);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc);
+ percpu_up_read(&trans->c->mark_lock);
+ return ret;
+}
+
+static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc,
+ unsigned idx, u64 *v, unsigned nr, bool gc)
+{
+ memset(v, 0, sizeof(*v) * nr);
+
+ if (unlikely(idx >= acc->k.nr))
+ return;
+
+ struct accounting_mem_entry *e = &acc->k.data[idx];
+
+ nr = min_t(unsigned, nr, e->nr_counters);
+
+ for (unsigned i = 0; i < nr; i++)
+ v[i] = percpu_u64_get(e->v[gc] + i);
+}
+
+static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
+ u64 *v, unsigned nr)
+{
+ struct bch_accounting_mem *acc = &c->accounting;
+ unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p);
+
+ bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+}
+
+int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
+int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
+void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);
+
+int bch2_gc_accounting_start(struct bch_fs *);
+int bch2_gc_accounting_done(struct bch_fs *);
+
+int bch2_accounting_read(struct bch_fs *);
+
+int bch2_dev_usage_remove(struct bch_fs *, unsigned);
+int bch2_dev_usage_init(struct bch_dev *, bool);
+
+void bch2_verify_accounting_clean(struct bch_fs *c);
+
+void bch2_accounting_gc_free(struct bch_fs *);
+void bch2_fs_accounting_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_H */
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
new file mode 100644
index 000000000000..cba417060b33
--- /dev/null
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
+
+#include "replicas_format.h"
+
+/*
+ * Disk accounting - KEY_TYPE_accounting - on disk format:
+ *
+ * Here, the key has considerably more structure than a typical key (bpos); an
+ * accounting key is 'struct disk_accounting_pos', which is a union of bpos.
+ *
+ * More specifically: a key is just a muliword integer (where word endianness
+ * matches native byte order), so we're treating bpos as an opaque 20 byte
+ * integer and mapping bch_accounting_key to that.
+ *
+ * This is a type-tagged union of all our various subtypes; a disk accounting
+ * key can be device counters, replicas counters, et cetera - it's extensible.
+ *
+ * The value is a list of u64s or s64s; the number of counters is specific to a
+ * given accounting type.
+ *
+ * Unlike with other key types, updates are _deltas_, and the deltas are not
+ * resolved until the update to the underlying btree, done by btree write buffer
+ * flush or journal replay.
+ *
+ * Journal replay in particular requires special handling. The journal tracks a
+ * range of entries which may possibly have not yet been applied to the btree
+ * yet - it does not know definitively whether individual entries are dirty and
+ * still need to be applied.
+ *
+ * To handle this, we use the version field of struct bkey, and give every
+ * accounting update a unique version number - a total ordering in time; the
+ * version number is derived from the key's position in the journal. Then
+ * journal replay can compare the version number of the key from the journal
+ * with the version number of the key in the btree to determine if a key needs
+ * to be replayed.
+ *
+ * For this to work, we must maintain this strict time ordering of updates as
+ * they are flushed to the btree, both via write buffer flush and via journal
+ * replay. This has complications for the write buffer code while journal replay
+ * is still in progress; the write buffer cannot flush any accounting keys to
+ * the btree until journal replay has finished replaying its accounting keys, or
+ * the (newer) version number of the keys from the write buffer will cause
+ * updates from journal replay to be lost.
+ */
+
+struct bch_accounting {
+ struct bch_val v;
+ __u64 d[];
+};
+
+#define BCH_ACCOUNTING_MAX_COUNTERS 3
+
+#define BCH_DATA_TYPES() \
+ x(free, 0) \
+ x(sb, 1) \
+ x(journal, 2) \
+ x(btree, 3) \
+ x(user, 4) \
+ x(cached, 5) \
+ x(parity, 6) \
+ x(stripe, 7) \
+ x(need_gc_gens, 8) \
+ x(need_discard, 9) \
+ x(unstriped, 10)
+
+enum bch_data_type {
+#define x(t, n) BCH_DATA_##t,
+ BCH_DATA_TYPES()
+#undef x
+ BCH_DATA_NR
+};
+
+static inline bool data_type_is_empty(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_free:
+ case BCH_DATA_need_gc_gens:
+ case BCH_DATA_need_discard:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool data_type_is_hidden(enum bch_data_type type)
+{
+ switch (type) {
+ case BCH_DATA_sb:
+ case BCH_DATA_journal:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#define BCH_DISK_ACCOUNTING_TYPES() \
+ x(nr_inodes, 0) \
+ x(persistent_reserved, 1) \
+ x(replicas, 2) \
+ x(dev_data_type, 3) \
+ x(compression, 4) \
+ x(snapshot, 5) \
+ x(btree, 6) \
+ x(rebalance_work, 7)
+
+enum disk_accounting_type {
+#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+ BCH_DISK_ACCOUNTING_TYPE_NR,
+};
+
+struct bch_nr_inodes {
+};
+
+struct bch_persistent_reserved {
+ __u8 nr_replicas;
+};
+
+struct bch_dev_data_type {
+ __u8 dev;
+ __u8 data_type;
+};
+
+struct bch_dev_stripe_buckets {
+ __u8 dev;
+};
+
+struct bch_acct_compression {
+ __u8 type;
+};
+
+struct bch_acct_snapshot {
+ __u32 id;
+};
+
+struct bch_acct_btree {
+ __u32 id;
+};
+
+struct disk_accounting_pos {
+ union {
+ struct {
+ __u8 type;
+ union {
+ struct bch_nr_inodes nr_inodes;
+ struct bch_persistent_reserved persistent_reserved;
+ struct bch_replicas_entry_v1 replicas;
+ struct bch_dev_data_type dev_data_type;
+ struct bch_dev_stripe_buckets dev_stripe_buckets;
+ struct bch_acct_compression compression;
+ struct bch_acct_snapshot snapshot;
+ struct bch_acct_btree btree;
+ };
+ };
+ struct bpos _pad;
+ };
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
new file mode 100644
index 000000000000..1687a45177a7
--- /dev/null
+++ b/fs/bcachefs/disk_accounting_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+#define _BCACHEFS_DISK_ACCOUNTING_TYPES_H
+
+#include "darray.h"
+
+struct accounting_mem_entry {
+ struct bpos pos;
+ struct bversion version;
+ unsigned nr_counters;
+ u64 __percpu *v[2];
+};
+
+struct bch_accounting_mem {
+ DARRAY(struct accounting_mem_entry) k;
+ bool gc_running;
+};
+
+#endif /* _BCACHEFS_DISK_ACCOUNTING_TYPES_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 521a86df5e52..5df8de0b8c02 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -511,7 +511,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
if (!c)
- return 0;
+ return -BCH_ERR_option_needs_open_fs;
if (!strlen(val) || !strcmp(val, "none")) {
*res = 0;
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 452d6b142784..9b5b5c9a6c63 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -13,6 +13,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -282,7 +283,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a =
- bch2_trans_start_alloc_update(trans, bucket);
+ bch2_trans_start_alloc_update(trans, bucket, 0);
ret = PTR_ERR_OR_ZERO(a) ?:
__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
}
@@ -300,13 +301,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
- if (!ret) {
- alloc_to_bucket(g, new);
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
- }
+ alloc_to_bucket(g, new);
bucket_unlock(g);
err_unlock:
percpu_up_read(&c->mark_lock);
+ if (!ret)
+ ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
}
err:
bch2_dev_put(ca);
@@ -368,7 +368,12 @@ int bch2_trigger_stripe(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_check_repair))
return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
- if (flags & BTREE_TRIGGER_transactional) {
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+
+ if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
* If the pointers aren't changing, we don't need to do anything:
*/
@@ -379,26 +384,58 @@ int bch2_trigger_stripe(struct btree_trans *trans,
new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
+ struct gc_stripe *gc = NULL;
+ if (flags & BTREE_TRIGGER_gc) {
+ gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+ if (!gc) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ *
+ * Also: when we bring back runtime gc, locking
+ */
+ gc->alive = true;
+ gc->sectors = le16_to_cpu(new_s->sectors);
+ gc->nr_blocks = new_s->nr_blocks;
+ gc->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ gc->ptrs[i] = new_s->ptrs[i];
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
+ }
if (new_s) {
- s64 sectors = le16_to_cpu(new_s->sectors);
+ s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, new);
- int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, new);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
+
+ if (gc)
+ memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
}
if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
- struct bch_replicas_padded r;
- bch2_bkey_to_replicas(&r.e, old);
- int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+ bch2_bkey_to_replicas(&acc.replicas, old);
+ int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
}
@@ -447,52 +484,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
}
}
- if (flags & BTREE_TRIGGER_gc) {
- struct gc_stripe *m =
- genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- */
- m->alive = true;
- m->sectors = le16_to_cpu(new_s->sectors);
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
-
- for (unsigned i = 0; i < new_s->nr_blocks; i++)
- m->ptrs[i] = new_s->ptrs[i];
-
- bch2_bkey_to_replicas(&m->r.e, new);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
- int ret = mark_stripe_buckets(trans, old, new, flags);
- if (ret)
- return ret;
-
- ret = bch2_update_replicas(c, new, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- 0, true);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
return 0;
}
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 58612abf7927..a268af3e52bf 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -257,7 +257,8 @@
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, need_inode_lock) \
- x(0, invalid_snapshot_node)
+ x(0, invalid_snapshot_node) \
+ x(0, option_needs_open_fs)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d95c40f1b6af..a62b63108820 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_iter.h"
#include "error.h"
#include "journal.h"
#include "recovery_passes.h"
@@ -98,7 +99,7 @@ static enum ask_yn parse_yn_response(char *buf)
}
#ifdef __KERNEL__
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
struct stdio_redirect *stdio = c->stdio;
@@ -108,25 +109,44 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
if (!stdio)
return YN_NO;
- char buf[100];
+ if (trans)
+ bch2_trans_unlock(trans);
+
+ unsigned long unlock_long_at = trans ? jiffies + HZ * 2 : 0;
+ darray_char line = {};
int ret;
do {
+ unsigned long t;
bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+rewait:
+ t = unlock_long_at
+ ? max_t(long, unlock_long_at - jiffies, 0)
+ : MAX_SCHEDULE_TIMEOUT;
+
+ int r = bch2_stdio_redirect_readline_timeout(stdio, &line, t);
+ if (r == -ETIME) {
+ bch2_trans_unlock_long(trans);
+ unlock_long_at = 0;
+ goto rewait;
+ }
- int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
- if (r < 0)
- return YN_NO;
- buf[r] = '\0';
- } while ((ret = parse_yn_response(buf)) < 0);
+ if (r < 0) {
+ ret = YN_NO;
+ break;
+ }
+
+ darray_last(line) = '\0';
+ } while ((ret = parse_yn_response(line.data)) < 0);
+ darray_exit(&line);
return ret;
}
#else
#include "tools-util.h"
-static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
{
char *buf = NULL;
size_t buflen = 0;
@@ -198,7 +218,8 @@ static const u8 fsck_flags_extra[] = {
#undef x
};
-int bch2_fsck_err(struct bch_fs *c,
+int __bch2_fsck_err(struct bch_fs *c,
+ struct btree_trans *trans,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
const char *fmt, ...)
@@ -210,9 +231,16 @@ int bch2_fsck_err(struct bch_fs *c,
int ret = -BCH_ERR_fsck_ignore;
const char *action_orig = "fix?", *action = action_orig;
+ might_sleep();
+
if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
flags |= fsck_flags_extra[err];
+ if (!c)
+ c = trans->c;
+
+ WARN_ON(!trans && bch2_current_has_btree_trans(c));
+
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
@@ -314,7 +342,15 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- int ask = bch2_fsck_ask_yn(c);
+ int ask = bch2_fsck_ask_yn(c, trans);
+
+ if (trans) {
+ ret = bch2_trans_relock(trans);
+ if (ret) {
+ mutex_unlock(&c->fsck_error_msgs_lock);
+ goto err;
+ }
+ }
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 777711504c35..995e6bba9bad 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -110,18 +110,21 @@ struct fsck_err_state {
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-__printf(4, 5) __cold
-int bch2_fsck_err(struct bch_fs *,
+__printf(5, 6) __cold
+int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,
enum bch_sb_error_id,
const char *, ...);
+#define bch2_fsck_err(c, _flags, _err_type, ...) \
+ __bch2_fsck_err(type_is(c, struct bch_fs *) ? (struct bch_fs *) c : NULL,\
+ type_is(c, struct btree_trans *) ? (struct btree_trans *) c : NULL,\
+ _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
+
void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err(c, _flags, _err_type, ...) \
({ \
- int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
- __VA_ARGS__); \
- \
+ int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
ret = _ret; \
@@ -136,7 +139,14 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
- (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+({ \
+ might_sleep(); \
+ \
+ if (type_is(c, struct bch_fs *)) \
+ WARN_ON(bch2_current_has_btree_trans((struct bch_fs *) c));\
+ \
+ (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
+})
#define need_fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 410b8bd81b5a..07973198e35f 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -37,8 +37,8 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
-static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
- unsigned dev)
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
+ unsigned dev)
{
struct bch_dev_io_failures *i;
@@ -52,7 +52,7 @@ static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
{
- struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
if (!f) {
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
@@ -140,7 +140,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+ f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
@@ -1034,6 +1034,18 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
--out->atomic;
}
+void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
+{
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
+ crc->compressed_size,
+ crc->uncompressed_size,
+ crc->offset, crc->nonce);
+ bch2_prt_csum_type(out, crc->csum_type);
+ prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
+ prt_str(out, " compress ");
+ bch2_prt_compression_type(out, crc->compression_type);
+}
+
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
@@ -1059,13 +1071,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
- crc.compressed_size,
- crc.uncompressed_size,
- crc.offset, crc.nonce);
- bch2_prt_csum_type(out, crc.csum_type);
- prt_str(out, " compress ");
- bch2_prt_compression_type(out, crc.compression_type);
+ bch2_extent_crc_unpacked_to_text(out, &crc);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1096,6 +1102,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+
static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
enum bch_validate_flags flags,
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 1ade959652b2..facdb8a86eec 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -212,6 +212,8 @@ static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
}
+void bch2_extent_crc_unpacked_to_text(struct printbuf *, struct bch_extent_crc_unpacked *);
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@@ -397,6 +399,8 @@ out: \
/* utility code common to all keys with pointers: */
+struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
+ unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 24840aee335c..0541192d7bc0 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i)
static inline unsigned eytzinger1_first(unsigned size)
{
- return rounddown_pow_of_two(size);
+ return size ? rounddown_pow_of_two(size) : 0;
}
static inline unsigned eytzinger1_last(unsigned size)
@@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
static inline unsigned eytzinger1_extra(unsigned size)
{
- return (size + 1 - rounddown_pow_of_two(size)) << 1;
+ return size
+ ? (size + 1 - rounddown_pow_of_two(size)) << 1
+ : 0;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
@@ -284,6 +286,17 @@ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
return eytzinger0_next(idx, nr);
}
+static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
+{
+ ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+ if (idx < nr && !cmp(base + idx * size, search))
+ return idx;
+
+ return eytzinger0_next(idx, nr);
+}
+
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
void *_base = (base); \
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
index dde237859514..c934e807b380 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/fs-common.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_FS_COMMON_H
#define _BCACHEFS_FS_COMMON_H
+#include "dirent.h"
+
struct posix_acl;
#define BCH_CREATE_TMPFILE (1U << 0)
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 54873ecc635c..cc33d763f722 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -678,8 +678,8 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
bch2_pagecache_add_get(inode);
folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
- FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
- mapping_gfp_mask(mapping));
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping));
if (IS_ERR_OR_NULL(folio))
goto err_unlock;
@@ -820,9 +820,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_init(&fs);
ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
- FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
- mapping_gfp_mask(mapping),
- &fs);
+ FGP_WRITEBEGIN | fgf_set_order(len),
+ mapping_gfp_mask(mapping), &fs);
if (ret)
goto out;
@@ -864,24 +863,26 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
f_pos = pos;
f_offset = pos - folio_pos(darray_first(fs));
darray_for_each(fs, fi) {
+ ssize_t f_reserved;
+
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
+ f_reserved = bch2_folio_reservation_get_partial(c, inode, f, &res, f_offset, f_len);
+
+ if (unlikely(f_reserved != f_len)) {
+ if (f_reserved < 0) {
+ if (f == darray_first(fs)) {
+ ret = f_reserved;
+ goto out;
+ }
+
+ folios_trunc(&fs, fi);
+ end = min(end, folio_end_pos(darray_last(fs)));
+ } else {
+ folios_trunc(&fs, fi + 1);
+ end = f_pos + f_reserved;
+ }
- /*
- * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
- * supposed to write as much as we have disk space for.
- *
- * On failure here we should still write out a partial page if
- * we aren't completely out of disk space - we don't do that
- * yet:
- */
- ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
- if (unlikely(ret)) {
- folios_trunc(&fs, fi);
- if (!fs.nr)
- goto out;
-
- end = min(end, folio_end_pos(darray_last(fs)));
break;
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 049b61bc9a5b..e246b1e05aa2 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -179,7 +179,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct bch_inode_info *inode = file_bch_inode(file);
struct address_space *mapping = file->f_mapping;
size_t count = iov_iter_count(iter);
- ssize_t ret;
+ ssize_t ret = 0;
if (!count)
return 0; /* skip atime */
@@ -205,7 +205,7 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += ret;
} else {
bch2_pagecache_add_get(inode);
- ret = generic_file_read_iter(iocb, iter);
+ ret = filemap_read(iocb, iter, ret);
bch2_pagecache_add_put(inode);
}
out:
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index 872283e5bd1e..a9cc5cad9cc9 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -423,7 +423,7 @@ int bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- unsigned offset, unsigned len)
+ size_t offset, size_t len)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
@@ -437,8 +437,7 @@ int bch2_folio_reservation_get(struct bch_fs *c,
for (i = round_down(offset, block_bytes(c)) >> 9;
i < round_up(offset + len, block_bytes(c)) >> 9;
i++) {
- disk_sectors += sectors_to_reserve(&s->s[i],
- res->disk.nr_replicas);
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
quota_sectors += s->s[i].state == SECTOR_unallocated;
}
@@ -449,12 +448,9 @@ int bch2_folio_reservation_get(struct bch_fs *c,
}
if (quota_sectors) {
- ret = bch2_quota_reservation_add(c, inode, &res->quota,
- quota_sectors, true);
+ ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = {
- .sectors = disk_sectors
- };
+ struct disk_reservation tmp = { .sectors = disk_sectors };
bch2_disk_reservation_put(c, &tmp);
res->disk.sectors -= disk_sectors;
@@ -465,6 +461,31 @@ int bch2_folio_reservation_get(struct bch_fs *c,
return 0;
}
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ size_t l, reserved = 0;
+ int ret;
+
+ while ((l = len - reserved)) {
+ while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) {
+ if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c))
+ return reserved ?: ret;
+
+ len = reserved + l;
+ l /= 2;
+ }
+
+ offset += l;
+ reserved += l;
+ }
+
+ return reserved;
+}
+
static void bch2_clear_folio_bits(struct folio *folio)
{
struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 828c3d7c8f19..fd7d692c087e 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -153,7 +153,12 @@ int bch2_folio_reservation_get(struct bch_fs *,
struct bch_inode_info *,
struct folio *,
struct bch2_folio_reservation *,
- unsigned, unsigned);
+ size_t, size_t);
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *,
+ struct bch_inode_info *,
+ struct folio *,
+ struct bch2_folio_reservation *,
+ size_t, size_t);
void bch2_set_folio_dirty(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index ef20b64033e0..77b85da30fb2 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -192,7 +192,9 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
+ int ret, err;
+
+ trace_bch2_fsync(file, datasync);
ret = file_write_and_wait_range(file, start, end);
if (ret)
@@ -205,6 +207,11 @@ out:
ret = bch2_err_class(ret);
if (ret == -EROFS)
ret = -EIO;
+
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
+
return ret;
}
@@ -508,7 +515,7 @@ static int inode_update_times_fn(struct btree_trans *trans,
return 0;
}
-static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
+static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
u64 end = offset + len;
@@ -547,7 +554,7 @@ err:
return ret;
}
-static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
+static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
loff_t offset, loff_t len,
bool insert)
{
@@ -583,7 +590,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
return ret;
}
-static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
u64 start_sector, u64 end_sector)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -704,7 +711,7 @@ bkey_err:
return ret;
}
-static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
loff_t offset, loff_t len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -860,9 +867,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
return -EINVAL;
- if (remap_flags & REMAP_FILE_DEDUP)
- return -EOPNOTSUPP;
-
if ((pos_src & (block_bytes(c) - 1)) ||
(pos_dst & (block_bytes(c) - 1)))
return -EINVAL;
@@ -895,7 +899,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
if (ret)
goto err;
- file_update_time(file_dst);
+ if (!(remap_flags & REMAP_FILE_DEDUP))
+ file_update_time(file_dst);
bch2_mark_pagecache_unallocated(src, pos_src >> 9,
(pos_src + aligned_len) >> 9);
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 79a0c8732bce..aea8132d2c40 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -272,6 +272,70 @@ err1:
return ret;
}
+static int bch2_ioc_getversion(struct bch_inode_info *inode, u32 __user *arg)
+{
+ return put_user(inode->v.i_generation, arg);
+}
+
+static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label)
+{
+ int ret;
+ size_t len;
+ char label[BCH_SB_LABEL_SIZE];
+
+ BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX);
+
+ mutex_lock(&c->sb_lock);
+ memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE);
+ mutex_unlock(&c->sb_lock);
+
+ len = strnlen(label, BCH_SB_LABEL_SIZE);
+ if (len == BCH_SB_LABEL_SIZE) {
+ bch_warn(c,
+ "label is too long, return the first %zu bytes",
+ --len);
+ }
+
+ ret = copy_to_user(user_label, label, len);
+
+ return ret ? -EFAULT : 0;
+}
+
+static int bch2_ioc_setlabel(struct bch_fs *c,
+ struct file *file,
+ struct bch_inode_info *inode,
+ const char __user *user_label)
+{
+ int ret;
+ char label[BCH_SB_LABEL_SIZE];
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(label, user_label, sizeof(label)))
+ return -EFAULT;
+
+ if (strnlen(label, BCH_SB_LABEL_SIZE) == BCH_SB_LABEL_SIZE) {
+ bch_err(c,
+ "unable to set label with more than %d bytes",
+ BCH_SB_LABEL_SIZE - 1);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
+
+ mutex_lock(&c->sb_lock);
+ strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE);
+ mutex_unlock(&c->sb_lock);
+
+ ret = bch2_write_super(c);
+
+ mnt_drop_write_file(file);
+ return ret;
+}
+
static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
{
u32 flags;
@@ -499,13 +563,21 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
case FS_IOC_GETVERSION:
- ret = -ENOTTY;
+ ret = bch2_ioc_getversion(inode, (u32 __user *) arg);
break;
case FS_IOC_SETVERSION:
ret = -ENOTTY;
break;
+ case FS_IOC_GETFSLABEL:
+ ret = bch2_ioc_getlabel(c, (void __user *) arg);
+ break;
+
+ case FS_IOC_SETFSLABEL:
+ ret = bch2_ioc_setlabel(c, file, inode, (const void __user *) arg);
+ break;
+
case FS_IOC_GOINGDOWN:
ret = bch2_ioc_goingdown(c, (u32 __user *) arg);
break;
@@ -547,6 +619,12 @@ long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC_GETFSLABEL:
+ case FS_IOC_SETFSLABEL:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index f9c9a95d7d4c..3a5f49affa0a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -26,11 +26,13 @@
#include "snapshot.h"
#include "super.h"
#include "xattr.h"
+#include "trace.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
+#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/posix_acl.h>
@@ -56,9 +58,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
BUG_ON(bi->bi_inum != inode->v.i_ino);
- bch2_assert_pos_locked(trans, BTREE_ID_inodes,
- POS(0, bi->bi_inum),
- c->opts.inodes_use_key_cache);
+ bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum));
set_nlink(&inode->v, bch2_inode_nlink_get(bi));
i_uid_write(&inode->v, bi->bi_uid);
@@ -194,6 +194,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
* discard_new_inode() expects it to be set...
*/
inode->v.i_flags |= I_NEW;
+ /*
+ * We don't want bch2_evict_inode() to delete the inode on disk,
+ * we just raced and had another inode in cache. Normally new
+ * inodes don't have nlink == 0 - except tmpfiles do...
+ */
+ set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
inode = old;
} else {
@@ -238,7 +244,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
kmem_cache_free(bch2_inode_cache, inode);
@@ -511,11 +516,11 @@ static int __bch2_link(struct bch_fs *c,
struct bch_inode_info *dir,
struct dentry *dentry)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct bch_inode_unpacked dir_u, inode_u;
int ret;
mutex_lock(&inode->ei_update_lock);
+ struct btree_trans *trans = bch2_trans_get(c);
ret = commit_do(trans, NULL, NULL, 0,
bch2_link_trans(trans,
@@ -562,11 +567,12 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_inode_unpacked dir_u, inode_u;
- struct btree_trans *trans = bch2_trans_get(c);
int ret;
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
+ struct btree_trans *trans = bch2_trans_get(c);
+
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
@@ -589,8 +595,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
set_nlink(&inode->v, 0);
}
err:
- bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
bch2_trans_put(trans);
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
return ret;
}
@@ -675,14 +681,14 @@ static int bch2_rename2(struct mnt_idmap *idmap,
return ret;
}
- trans = bch2_trans_get(c);
-
bch2_lock_inodes(INODE_UPDATE_LOCK,
src_dir,
dst_dir,
src_inode,
dst_inode);
+ trans = bch2_trans_get(c);
+
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
if (ret)
@@ -888,6 +894,16 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->subvol = inode->ei_subvol;
stat->result_mask |= STATX_SUBVOL;
+ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
+ stat->result_mask |= STATX_DIOALIGN;
+ /*
+ * this is incorrect; we should be tracking this in superblock,
+ * and checking the alignment of open devices
+ */
+ stat->dio_mem_align = SECTOR_SIZE;
+ stat->dio_offset_align = block_bytes(c);
+ }
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
@@ -1689,6 +1705,8 @@ static int bch2_sync_fs(struct super_block *sb, int wait)
struct bch_fs *c = sb->s_fs_info;
int ret;
+ trace_bch2_sync_fs(sb, wait);
+
if (c->opts.journal_flush_disabled)
return 0;
@@ -1717,15 +1735,11 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags, char *data)
+static int bch2_remount(struct super_block *sb, int *flags,
+ struct bch_opts opts)
{
struct bch_fs *c = sb->s_fs_info;
- struct bch_opts opts = bch2_opts_empty();
- int ret;
-
- ret = bch2_parse_mount_opts(c, &opts, data);
- if (ret)
- goto err;
+ int ret = 0;
opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
@@ -1785,7 +1799,8 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
- if (!(opt->flags & OPT_MOUNT))
+ if ((opt->flags & OPT_HIDDEN) ||
+ !(opt->flags & OPT_MOUNT))
continue;
if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
@@ -1852,7 +1867,6 @@ static const struct super_operations bch_super_operations = {
.statfs = bch2_statfs,
.show_devname = bch2_show_devname,
.show_options = bch2_show_options,
- .remount_fs = bch2_remount,
.put_super = bch2_put_super,
.freeze_fs = bch2_freeze,
.unfreeze_fs = bch2_unfreeze,
@@ -1885,76 +1899,63 @@ static int bch2_test_super(struct super_block *s, void *data)
return true;
}
-static struct dentry *bch2_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bch2_fs_get_tree(struct fs_context *fc)
{
struct bch_fs *c;
struct super_block *sb;
struct inode *vinode;
- struct bch_opts opts = bch2_opts_empty();
+ struct bch2_opts_parse *opts_parse = fc->fs_private;
+ struct bch_opts opts = opts_parse->opts;
+ darray_str devs;
+ darray_fs devs_to_fs = {};
int ret;
- opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
-
- ret = bch2_parse_mount_opts(NULL, &opts, data);
- if (ret) {
- ret = bch2_err_class(ret);
- return ERR_PTR(ret);
- }
+ opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+ opt_set(opts, nostart, true);
- if (!dev_name || strlen(dev_name) == 0)
- return ERR_PTR(-EINVAL);
+ if (!fc->source || strlen(fc->source) == 0)
+ return -EINVAL;
- darray_str devs;
- ret = bch2_split_devs(dev_name, &devs);
+ ret = bch2_split_devs(fc->source, &devs);
if (ret)
- return ERR_PTR(ret);
+ return ret;
- darray_fs devs_to_fs = {};
darray_for_each(devs, i) {
ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
- if (ret) {
- sb = ERR_PTR(ret);
- goto got_sb;
- }
+ if (ret)
+ goto err;
}
- sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
+ sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;
c = bch2_fs_open(devs.data, devs.nr, opts);
- if (IS_ERR(c)) {
- sb = ERR_CAST(c);
- goto got_sb;
- }
+ ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ goto err;
/* Some options can't be parsed until after the fs is started: */
- ret = bch2_parse_mount_opts(c, &opts, data);
- if (ret) {
- bch2_fs_stop(c);
- sb = ERR_PTR(ret);
- goto got_sb;
- }
+ opts = bch2_opts_empty();
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
+ if (ret)
+ goto err_stop_fs;
bch2_opts_apply(&c->opts, opts);
- sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
- if (IS_ERR(sb))
- bch2_fs_stop(c);
-got_sb:
- darray_exit(&devs_to_fs);
- bch2_darray_str_exit(&devs);
-
- if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- goto err;
- }
+ ret = bch2_fs_start(c);
+ if (ret)
+ goto err_stop_fs;
+ sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
+ ret = PTR_ERR_OR_ZERO(sb);
+ if (ret)
+ goto err_stop_fs;
+got_sb:
c = sb->s_fs_info;
if (sb->s_root) {
- if ((flags ^ sb->s_flags) & SB_RDONLY) {
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
ret = -EBUSY;
goto err_put_super;
}
@@ -2020,12 +2021,12 @@ got_sb:
sb->s_flags |= SB_ACTIVE;
out:
- return dget(sb->s_root);
-
-err_put_super:
- __bch2_fs_stop(c);
- deactivate_locked_super(sb);
+ fc->root = dget(sb->s_root);
err:
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
+ if (ret)
+ pr_err("error: %s", bch2_err_str(ret));
/*
* On an inconsistency error in recovery we might see an -EROFS derived
* errorcode (from the journal), but we don't want to return that to
@@ -2034,7 +2035,16 @@ err:
*/
if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
ret = -EIO;
- return ERR_PTR(bch2_err_class(ret));
+ return bch2_err_class(ret);
+
+err_stop_fs:
+ bch2_fs_stop(c);
+ goto err;
+
+err_put_super:
+ __bch2_fs_stop(c);
+ deactivate_locked_super(sb);
+ goto err;
}
static void bch2_kill_sb(struct super_block *sb)
@@ -2045,12 +2055,76 @@ static void bch2_kill_sb(struct super_block *sb)
bch2_fs_free(c);
}
+static void bch2_fs_context_free(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ if (opts) {
+ printbuf_exit(&opts->parse_later);
+ kfree(opts);
+ }
+}
+
+static int bch2_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ /*
+ * the "source" param, i.e., the name of the device(s) to mount,
+ * is handled by the VFS layer.
+ */
+ if (!strcmp(param->key, "source"))
+ return -ENOPARAM;
+
+ struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = NULL;
+
+ /* for reconfigure, we already have a struct bch_fs */
+ if (fc->root)
+ c = fc->root->d_sb->s_fs_info;
+
+ int ret = bch2_parse_one_mount_opt(c, &opts->opts,
+ &opts->parse_later, param->key,
+ param->string);
+
+ return bch2_err_class(ret);
+}
+
+static int bch2_fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct bch2_opts_parse *opts = fc->fs_private;
+
+ return bch2_remount(sb, &fc->sb_flags, opts->opts);
+}
+
+static const struct fs_context_operations bch2_context_ops = {
+ .free = bch2_fs_context_free,
+ .parse_param = bch2_fs_parse_param,
+ .get_tree = bch2_fs_get_tree,
+ .reconfigure = bch2_fs_reconfigure,
+};
+
+static int bch2_init_fs_context(struct fs_context *fc)
+{
+ struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+
+ if (!opts)
+ return -ENOMEM;
+
+ opts->parse_later = PRINTBUF;
+
+ fc->ops = &bch2_context_ops;
+ fc->fs_private = opts;
+
+ return 0;
+}
+
static struct file_system_type bcache_fs_type = {
- .owner = THIS_MODULE,
- .name = "bcachefs",
- .mount = bch2_mount,
- .kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bcachefs",
+ .init_fs_context = bch2_init_fs_context,
+ .kill_sb = bch2_kill_sb,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("bcachefs");
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 921bcdb3e5e4..cc4f0963c0c5 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -455,31 +455,42 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
return 0;
}
-static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
struct bch_fs *c = trans->c;
- struct bch_inode_unpacked new_inode;
+ unsigned i_mode = S_IFREG;
+ u64 i_size = 0;
- bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
- new_inode.bi_size = size;
- new_inode.bi_inum = inum;
+ switch (btree) {
+ case BTREE_ID_extents: {
+ struct btree_iter iter = {};
- return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
-}
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+ bch2_trans_iter_exit(trans, &iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
-static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
-{
- struct btree_iter iter = {};
+ i_size = k.k->p.offset << 9;
+ break;
+ }
+ case BTREE_ID_dirents:
+ i_mode = S_IFDIR;
+ break;
+ case BTREE_ID_xattrs:
+ break;
+ default:
+ BUG();
+ }
- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
- bch2_trans_iter_exit(trans, &iter);
- int ret = bkey_err(k);
- if (ret)
- return ret;
+ struct bch_inode_unpacked new_inode;
+ bch2_inode_init_early(c, &new_inode);
+ bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
+ new_inode.bi_size = i_size;
+ new_inode.bi_inum = inum;
- return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+ return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
}
struct snapshots_seen {
@@ -824,8 +835,8 @@ static int hash_check_key(struct btree_trans *trans,
break;
if (fsck_err_on(k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k), c,
- hash_table_key_duplicate,
+ !desc.cmp_bkey(k, hash_k),
+ trans, hash_table_key_duplicate,
"duplicate hash table keys:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k),
@@ -844,7 +855,7 @@ out:
printbuf_exit(&buf);
return ret;
bad_hash:
- if (fsck_err(c, hash_table_key_wrong_offset,
+ if (fsck_err(trans, hash_table_key_wrong_offset,
"hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
@@ -919,11 +930,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
return ret;
if (fsck_err_on(ret,
- c, inode_points_to_missing_dirent,
+ trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
- c, inode_points_to_wrong_dirent,
+ trans, inode_points_to_wrong_dirent,
"inode points to dirent that does not point back:\n%s",
(bch2_bkey_val_to_text(&buf, c, inode_k),
prt_newline(&buf),
@@ -986,7 +997,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
inode_d_type(prev) != inode_d_type(&u),
- c, inode_snapshot_mismatch,
+ trans, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -BCH_ERR_fsck_repair_unimplemented;
@@ -1018,7 +1029,8 @@ static int check_inode(struct btree_trans *trans,
if (ret < 0)
return ret;
- fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
ret = 0;
@@ -1026,7 +1038,7 @@ static int check_inode(struct btree_trans *trans,
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
- fsck_err(c, inode_unlinked_but_clean,
+ fsck_err(trans, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
@@ -1036,7 +1048,7 @@ static int check_inode(struct btree_trans *trans,
if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean ||
- fsck_err(c, inode_i_size_dirty_but_clean,
+ fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
@@ -1066,7 +1078,7 @@ static int check_inode(struct btree_trans *trans,
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean ||
- fsck_err(c, inode_i_sectors_dirty_but_clean,
+ fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
@@ -1101,7 +1113,7 @@ static int check_inode(struct btree_trans *trans,
if (fsck_err_on(u.bi_parent_subvol &&
(u.bi_subvol == 0 ||
u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
- c, inode_bi_parent_nonzero,
+ trans, inode_bi_parent_nonzero,
"inode %llu:%u has subvol %u but nonzero parent subvol %u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
u.bi_parent_subvol = 0;
@@ -1121,13 +1133,13 @@ static int check_inode(struct btree_trans *trans,
}
if (fsck_err_on(ret,
- c, inode_bi_subvol_missing,
+ trans, inode_bi_subvol_missing,
"inode %llu:%u bi_subvol points to missing subvolume %u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
k.k->p.snapshot),
- c, inode_bi_subvol_wrong,
+ trans, inode_bi_subvol_wrong,
"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
u.bi_inum, k.k->p.snapshot, u.bi_subvol,
le64_to_cpu(s.inode),
@@ -1170,6 +1182,71 @@ int bch2_check_inodes(struct bch_fs *c)
return ret;
}
+static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
+{
+ switch (btree) {
+ case BTREE_ID_extents:
+ return S_ISREG(mode) || S_ISLNK(mode);
+ case BTREE_ID_dirents:
+ return S_ISDIR(mode);
+ case BTREE_ID_xattrs:
+ return true;
+ default:
+ BUG();
+ }
+}
+
+static int check_key_has_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct inode_walker *inode,
+ struct inode_walker_entry *i,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = PTR_ERR_OR_ZERO(i);
+ if (ret)
+ return ret;
+
+ if (k.k->type == KEY_TYPE_whiteout)
+ goto out;
+
+ if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto err;
+ }
+
+ if (fsck_err_on(!i,
+ trans, key_in_missing_inode,
+ "key in missing inode:\n %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+
+ if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ trans, key_in_wrong_inode_type,
+ "key for wrong inode mode %o:\n %s",
+ i->inode.bi_mode,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ goto delete;
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+delete:
+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
+ goto out;
+}
+
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
@@ -1192,7 +1269,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
- c, inode_i_sectors_wrong,
+ trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
@@ -1340,7 +1417,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
- if (fsck_err(c, extent_overlapping,
+ if (fsck_err(trans, extent_overlapping,
"overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
@@ -1476,43 +1553,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ if (ret)
+ goto err;
+
i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
goto err;
- ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
+ ret = check_key_has_inode(trans, iter, inode, i, k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
- ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- inode->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
- goto err;
- }
-
- if (fsck_err_on(!i, c, extent_in_missing_inode,
- "extent in missing inode:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
-
- if (fsck_err_on(i &&
- !S_ISREG(i->inode.bi_mode) &&
- !S_ISLNK(i->inode.bi_mode),
- c, extent_in_non_reg_inode,
- "extent in non regular inode mode %o:\n %s",
- i->inode.bi_mode,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
-
ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
&inode->recalculate_sums);
if (ret)
@@ -1525,7 +1579,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
* didn't have one, iterate over all inodes:
*/
if (!i)
- i = inode->inodes.data + inode->inodes.nr - 1;
+ i = &darray_last(inode->inodes);
for (;
inode->inodes.data && i >= inode->inodes.data;
@@ -1538,7 +1592,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
- c, extent_past_end_of_inode,
+ trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1574,9 +1628,6 @@ fsck_err:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
-delete:
- ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
- goto out;
}
/*
@@ -1656,7 +1707,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
}
if (fsck_err_on(i->inode.bi_nlink != i->count,
- c, inode_dir_wrong_nlink,
+ trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
@@ -1692,7 +1743,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return 0;
if (bch2_inode_should_have_bp(target) &&
- !fsck_err(c, inode_wrong_backpointer,
+ !fsck_err(trans, inode_wrong_backpointer,
"dirent points to inode that does not point back:\n %s",
(bch2_bkey_val_to_text(&buf, c, d.s_c),
prt_printf(&buf, "\n "),
@@ -1718,7 +1769,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
ret = 0;
if (fsck_err_on(!backpointer_exists,
- c, inode_wrong_backpointer,
+ trans, inode_wrong_backpointer,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
@@ -1741,7 +1792,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
if (fsck_err_on(backpointer_exists &&
(S_ISDIR(target->bi_mode) ||
target->bi_subvol),
- c, inode_dir_multiple_links,
+ trans, inode_dir_multiple_links,
"%s %llu:%u with multiple links\n%s",
S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
target->bi_inum, target_snapshot, buf.buf)) {
@@ -1755,7 +1806,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
* it up, it ignores inodes with nlink 0
*/
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- c, inode_multiple_links_but_nlink_0,
+ trans, inode_multiple_links_but_nlink_0,
"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
@@ -1791,7 +1842,7 @@ static int check_dirent_target(struct btree_trans *trans,
goto err;
if (fsck_err_on(d.v->d_type != inode_d_type(target),
- c, dirent_d_type_wrong,
+ trans, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
@@ -1889,11 +1940,12 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
parent_snapshot = d.k->p.snapshot;
}
- if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
+ if (fsck_err_on(ret,
+ trans, dirent_to_missing_parent_subvol,
"dirent parent_subvol points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
- c, dirent_not_visible_in_parent_subvol,
+ trans, dirent_not_visible_in_parent_subvol,
"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
parent_snapshot,
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
@@ -1919,7 +1971,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
return ret;
if (ret) {
- if (fsck_err(c, dirent_to_missing_subvol,
+ if (fsck_err(trans, dirent_to_missing_subvol,
"dirent points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
return __remove_dirent(trans, d.k->p);
@@ -1928,7 +1980,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
}
if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
- c, subvol_fs_path_parent_wrong,
+ trans, subvol_fs_path_parent_wrong,
"subvol with wrong fs_path_parent, should be be %u\n%s",
parent_subvol,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
@@ -1956,7 +2008,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
}
if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
- c, inode_bi_parent_wrong,
+ trans, inode_bi_parent_wrong,
"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
target_inum,
subvol_root.bi_parent_subvol, parent_subvol)) {
@@ -2009,49 +2061,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
-
i = walk_inode(trans, dir, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret < 0)
goto err;
- if (dir->first_this_inode && dir->inodes.nr)
- *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
- dir->first_this_inode = false;
-
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
- ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
-
- dir->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = check_key_has_inode(trans, iter, dir, i, k);
+ if (ret)
goto err;
- }
-
- if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
- "dirent in nonexisting directory:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node);
- goto out;
- }
if (!i)
goto out;
- if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
- c, dirent_in_non_dir_inode,
- "dirent in non directory inode type %s:\n%s",
- bch2_d_type_str(inode_d_type(&i->inode)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = bch2_btree_delete_at(trans, iter, 0);
- goto out;
- }
+ if (dir->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ dir->first_this_inode = false;
ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
@@ -2077,7 +2101,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (fsck_err_on(!target->inodes.nr,
- c, dirent_to_missing_inode,
+ trans, dirent_to_missing_inode,
"dirent points to missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
@@ -2156,20 +2180,18 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
- if (inode->first_this_inode && inode->inodes.nr)
- *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
- inode->first_this_inode = false;
-
- if (fsck_err_on(!i, c, xattr_in_missing_inode,
- "xattr for missing inode %llu",
- k.k->p.inode))
- return bch2_btree_delete_at(trans, iter, 0);
+ ret = check_key_has_inode(trans, iter, inode, i, k);
+ if (ret)
+ return ret;
if (!i)
return 0;
+ if (inode->first_this_inode)
+ *hash_info = bch2_hash_info_init(c, &i->inode);
+ inode->first_this_inode = false;
+
ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
-fsck_err:
bch_err_fn(c, ret);
return ret;
}
@@ -2207,7 +2229,7 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+ if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
"root subvol missing")) {
struct bkey_i_subvolume *root_subvol =
bch2_trans_kmalloc(trans, sizeof(*root_subvol));
@@ -2233,10 +2255,11 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+ if (mustfix_fsck_err_on(ret,
+ trans, root_dir_missing,
"root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
- c, root_inode_not_dir,
+ trans, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
@@ -2308,7 +2331,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
break;
if (fsck_err_on(!ret,
- c, subvol_unreachable,
+ trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
@@ -2333,7 +2356,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
- c, subvol_unreachable,
+ trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
@@ -2412,7 +2435,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (bch2_err_matches(ret, ENOENT)) {
ret = 0;
- if (fsck_err(c, inode_unreachable,
+ if (fsck_err(trans, inode_unreachable,
"unreachable inode\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, inode_k),
@@ -2458,7 +2481,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode.bi_inum, snapshot);
- if (fsck_err(c, dir_loop, "directory structure loop")) {
+ if (fsck_err(trans, dir_loop, "directory structure loop")) {
ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
if (ret)
@@ -2664,7 +2687,6 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
struct nlink_table *links,
size_t *idx, u64 range_end)
{
- struct bch_fs *c = trans->c;
struct bch_inode_unpacked u;
struct nlink *link = &links->d[*idx];
int ret = 0;
@@ -2690,7 +2712,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
}
if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
- c, inode_wrong_nlink,
+ trans, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index aafa79fa6351..1e20020eadd1 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -8,6 +8,7 @@
#include "buckets.h"
#include "compress.h"
#include "dirent.h"
+#include "disk_accounting.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -534,12 +535,13 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
+ prt_printf(out, "\n");
printbuf_indent_add(out, 2);
prt_printf(out, "mode=%o\n", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
- prt_printf(out, " (%x)\n", inode->bi_flags);
+ prt_printf(out, "(%x)\n", inode->bi_flags);
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
@@ -550,6 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
+
+ bch2_printbuf_strip_trailing_newline(out);
printbuf_indent_sub(out, 2);
}
@@ -596,39 +600,26 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
- s64 nr = (s64) bkey_is_inode(new.k) - (s64) bkey_is_inode(old.k);
-
- if (flags & BTREE_TRIGGER_transactional) {
- if (nr) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
-
- trans->fs_usage_deltas->nr_inodes += nr;
- }
-
- bool old_deleted = bkey_is_deleted_inode(old);
- bool new_deleted = bkey_is_deleted_inode(new.s_c);
- if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, new_deleted);
- if (ret)
- return ret;
- }
- }
-
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);
-
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- if (flags & BTREE_TRIGGER_gc) {
- struct bch_fs *c = trans->c;
+ s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
+ struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
+ int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ if (ret)
+ return ret;
+ }
- percpu_down_read(&c->mark_lock);
- this_cpu_add(c->usage_gc->b.nr_inodes, nr);
- percpu_up_read(&c->mark_lock);
+ int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) -
+ (int) bkey_is_deleted_inode(old);
+ if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, deleted_delta > 0);
+ if (ret)
+ return ret;
}
return 0;
@@ -1096,8 +1087,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
return ret;
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k), c,
- deleted_inode_missing,
+ if (fsck_err_on(!bkey_is_inode(k.k),
+ trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1109,7 +1100,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
- c, deleted_inode_is_dir,
+ trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@@ -1117,15 +1108,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
- deleted_inode_not_unlinked,
+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
if (c->sb.clean &&
- !fsck_err(c,
- deleted_inode_but_clean,
+ !fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
ret = 0;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 679f5f5e5d15..da0e4a745099 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -88,7 +88,7 @@ struct bkey_inode_buf {
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[0 + BCH_INODE_FIELDS_v3()];
#undef x
-} __packed __aligned(8);
+};
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 4ec979b4b23e..2cf6297756f8 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -125,8 +125,12 @@ err_noprint:
bch2_bkey_buf_exit(&old, c);
if (closure_nr_remaining(&cl) != 1) {
- bch2_trans_unlock(trans);
- closure_sync(&cl);
+ bch2_trans_unlock_long(trans);
+
+ if (closure_sync_timeout(&cl, HZ * 10)) {
+ bch2_print_allocator_stuck(c);
+ closure_sync(&cl);
+ }
}
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index c97fa7002b06..4531c9ab3e12 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -93,21 +93,24 @@ static const struct rhashtable_params bch_promote_params = {
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
- unsigned flags)
+ unsigned flags,
+ struct bch_io_failures *failed)
{
- BUG_ON(!opts.promote_target);
+ if (!failed) {
+ BUG_ON(!opts.promote_target);
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return -BCH_ERR_nopromote_may_not;
+ if (!(flags & BCH_READ_MAY_PROMOTE))
+ return -BCH_ERR_nopromote_may_not;
- if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
+ return -BCH_ERR_nopromote_already_promoted;
- if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ if (bkey_extent_is_unwritten(k))
+ return -BCH_ERR_nopromote_unwritten;
- if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ if (bch2_target_congested(c, opts.promote_target))
+ return -BCH_ERR_nopromote_congested;
+ }
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
@@ -164,7 +167,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
unsigned sectors,
- struct bch_read_bio **rbio)
+ struct bch_read_bio **rbio,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
struct promote_op *op = NULL;
@@ -217,14 +221,28 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+ struct data_update_opts update_opts = {};
+
+ if (!failed) {
+ update_opts.target = opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
+ } else {
+ update_opts.target = opts.foreground_target;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev))
+ update_opts.rewrite_ptrs |= BIT(i);
+ i++;
+ }
+ }
+
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
opts,
- (struct data_update_opts) {
- .target = opts.promote_target,
- .extra_replicas = 1,
- .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
- },
+ update_opts,
btree_id, k);
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
@@ -258,10 +276,17 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
unsigned flags,
struct bch_read_bio **rbio,
bool *bounce,
- bool *read_full)
+ bool *read_full,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
- bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+ /*
+ * if failed != NULL we're not actually doing a promote, we're
+ * recovering from an io/checksum error
+ */
+ bool promote_full = (failed ||
+ *read_full ||
+ READ_ONCE(c->promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -272,7 +297,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct promote_op *promote;
int ret;
- ret = should_promote(c, k, pos, opts, flags);
+ ret = should_promote(c, k, pos, opts, flags, failed);
if (ret)
goto nopromote;
@@ -280,7 +305,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_reflink
: BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio);
+ k, pos, pick, opts, sectors, rbio, failed);
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
@@ -389,7 +414,6 @@ retry:
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- bch2_trans_unlock(trans);
if (!bch2_bkey_matches_ptr(c, k,
rbio->pick.ptr,
@@ -911,9 +935,9 @@ retry_pick:
bounce = true;
}
- if (orig->opts.promote_target)
+ if (orig->opts.promote_target)// || failed)
promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full);
+ &rbio, &bounce, &read_full, failed);
if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@@ -1004,6 +1028,9 @@ get_bio:
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
+ if (flags & BCH_READ_NODECODE)
+ orig->pick = pick;
+
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
@@ -1120,34 +1147,27 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
- u32 snapshot;
int ret;
BUG_ON(flags & BCH_READ_NODECODE);
bch2_bkey_buf_init(&sk);
-retry:
- bch2_trans_begin(trans);
- iter = (struct btree_iter) { NULL };
-
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto err;
-
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+ POS(inum.inum, bvec_iter.bi_sector),
BTREE_ITER_slots);
+
while (1) {
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
- /*
- * read_extent -> io_time_reset may cause a transaction restart
- * without returning an error, we need to check for that here:
- */
- ret = bch2_trans_relock(trans);
+ bch2_trans_begin(trans);
+
+ u32 snapshot;
+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
- break;
+ goto err;
+
+ bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, bvec_iter.bi_sector));
@@ -1155,7 +1175,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
- break;
+ goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@@ -1166,7 +1186,7 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
- break;
+ goto err;
k = bkey_i_to_s_c(sk.k);
@@ -1186,7 +1206,7 @@ retry:
data_btree, k,
offset_into_extent, failed, flags);
if (ret)
- break;
+ goto err;
if (flags & BCH_READ_LAST_FRAGMENT)
break;
@@ -1196,16 +1216,16 @@ retry:
ret = btree_trans_too_many_iters(trans);
if (ret)
+ goto err;
+err:
+ if (ret &&
+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
+ ret != READ_RETRY &&
+ ret != READ_RETRY_AVOID)
break;
}
-err:
- bch2_trans_iter_exit(trans, &iter);
-
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
- ret == READ_RETRY ||
- ret == READ_RETRY_AVOID)
- goto retry;
+ bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 05e0cbef420b..d31c8d006d97 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -69,11 +69,10 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
u64 io_latency = time_after64(now, submit_time)
? now - submit_time
: 0;
- u64 old, new, v = atomic64_read(latency);
+ u64 old, new;
+ old = atomic64_read(latency);
do {
- old = v;
-
/*
* If the io latency was reasonably close to the current
* latency, skip doing the update and atomic operation - most of
@@ -84,7 +83,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
break;
new = ewma_add(old, io_latency, 5);
- } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+ } while (!atomic64_try_cmpxchg(latency, &old, new));
bch2_congested_acct(ca, io_latency, now, rw);
@@ -555,7 +554,7 @@ out:
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
goto out;
}
@@ -590,7 +589,7 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
- if ((op->flags & BCH_WRITE_DONE) &&
+ if ((op->flags & BCH_WRITE_SUBMITTED) &&
(op->flags & BCH_WRITE_MOVE))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
@@ -635,7 +634,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@@ -1081,7 +1080,10 @@ do_write:
*_dst = dst;
return more;
csum_err:
- bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)",
op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
@@ -1316,7 +1318,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
}
op->pos.offset += bio_sectors(bio);
@@ -1330,7 +1332,7 @@ retry:
op->insert_keys.top, true);
bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_SUBMITTED)
break;
bch2_btree_iter_advance(&iter);
}
@@ -1345,14 +1347,14 @@ err:
op->pos.inode, op->pos.offset << 9,
"%s: btree lookup error %s", __func__, bch2_err_str(ret));
op->error = ret;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
}
bch2_trans_put(trans);
darray_exit(&buckets);
/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_DONE)) {
+ if (!(op->flags & BCH_WRITE_SUBMITTED)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
@@ -1408,7 +1410,7 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_DONE)
+ if (op->flags & BCH_WRITE_SUBMITTED)
goto out_nofs_restore;
}
again:
@@ -1463,7 +1465,7 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
if (ret < 0) {
if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
@@ -1499,7 +1501,7 @@ err:
* once, as that signals backpressure to the caller.
*/
if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_DONE) &&
+ (!(op->flags & BCH_WRITE_SUBMITTED) &&
!(op->flags & BCH_WRITE_IN_WORKER))) {
if (closure_sync_timeout(&op->cl, HZ * 10)) {
bch2_print_allocator_stuck(c);
@@ -1508,7 +1510,7 @@ err:
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_DONE))
+ if (!(op->flags & BCH_WRITE_SUBMITTED))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1530,7 +1532,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
memset(&op->failed, 0, sizeof(op->failed));
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_DONE;
+ op->flags |= BCH_WRITE_SUBMITTED;
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index 6c276a48f95d..5400ce94ee57 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -33,7 +33,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
x(SYNC) \
x(MOVE) \
x(IN_WORKER) \
- x(DONE) \
+ x(SUBMITTED) \
x(IO_ERROR) \
x(CONVERT_UNWRITTEN)
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index dac2f498ae8b..649e3a01608a 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -230,7 +230,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
unsigned sectors;
BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL &&
@@ -238,15 +237,16 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
lockdep_assert_held(&j->lock);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
new.cur_entry_offset = closed_val;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ||
old.cur_entry_offset == new.cur_entry_offset)
return;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
if (!__journal_entry_is_open(old))
return;
@@ -353,7 +353,6 @@ static int journal_entry_open(struct journal *j)
((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK);
union journal_res_state old, new;
int u64s;
- u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
@@ -432,9 +431,9 @@ static int journal_entry_open(struct journal *j)
*/
j->cur_entry_u64s = u64s;
- v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL);
@@ -446,8 +445,8 @@ static int journal_entry_open(struct journal *j)
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
if (nr_unwritten_journal_entries(j) == 1)
mod_delayed_work(j->wq,
@@ -1095,7 +1094,7 @@ unlock:
return ret;
}
-int bch2_dev_journal_alloc(struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
unsigned nr;
int ret;
@@ -1117,7 +1116,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
+ ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
err:
bch_err_fn(ca, ret);
return ret;
@@ -1129,7 +1128,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c)
if (ca->journal.nr)
continue;
- int ret = bch2_dev_journal_alloc(ca);
+ int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
percpu_ref_put(&ca->io_ref);
return ret;
@@ -1184,9 +1183,11 @@ void bch2_fs_journal_stop(struct journal *j)
journal_quiesce(j);
cancel_delayed_work_sync(&j->write_work);
- BUG_ON(!bch2_journal_error(j) &&
- test_bit(JOURNAL_replay_done, &j->flags) &&
- j->last_empty_seq != journal_cur_seq(j));
+ WARN(!bch2_journal_error(j) &&
+ test_bit(JOURNAL_replay_done, &j->flags) &&
+ j->last_empty_seq != journal_cur_seq(j),
+ "journal shutdown error: cur seq %llu but last empty seq %llu",
+ journal_cur_seq(j), j->last_empty_seq);
if (!bch2_journal_error(j))
clear_bit(JOURNAL_running, &j->flags);
@@ -1418,8 +1419,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
unsigned long now = jiffies;
u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 28);
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 28);
out->atomic++;
rcu_read_lock();
@@ -1521,6 +1522,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
struct journal_entry_pin *pin;
spin_lock(&j->lock);
+ if (!test_bit(JOURNAL_running, &j->flags)) {
+ spin_unlock(&j->lock);
+ return true;
+ }
+
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index fd1f7cdaa8bc..377a3750406e 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -327,10 +327,10 @@ static inline int journal_res_get_fast(struct journal *j,
unsigned flags)
{
union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
/*
* Check if there is still room in the current journal
@@ -356,8 +356,8 @@ static inline int journal_res_get_fast(struct journal *j,
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
res->ref = true;
res->idx = old.idx;
@@ -433,7 +433,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
-int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 492426c8d869..7a833a3f1c63 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -415,6 +415,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
flags|BCH_VALIDATE_journal);
if (ret == FSCK_DELETED_KEY)
continue;
+ else if (ret)
+ return ret;
k = bkey_next(k);
}
@@ -722,13 +724,16 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
+ printbuf_indent_add(out, 2);
for (i = 0; i < nr_types; i++) {
+ prt_newline(out);
bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
+ printbuf_indent_sub(out, 2);
}
static int journal_entry_log_validate(struct bch_fs *c,
@@ -1583,7 +1588,7 @@ static CLOSURE_CALLBACK(journal_write_done)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
- u64 v, seq = le64_to_cpu(w->data->seq);
+ u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1642,14 +1647,15 @@ static CLOSURE_CALLBACK(journal_write_done)
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
- v = atomic64_read(&j->reservations.counter);
+ old.v = atomic64_read(&j->reservations.counter);
do {
- old.v = new.v = v;
+ new.v = old.v;
BUG_ON(journal_state_count(new, new.unwritten_idx));
BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
new.unwritten_idx++;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
+ } while (!atomic64_try_cmpxchg(&j->reservations.counter,
+ &old.v, new.v));
closure_wake_up(&w->wait);
completed = true;
@@ -1677,6 +1683,13 @@ static CLOSURE_CALLBACK(journal_write_done)
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
+ /*
+ * We don't typically trigger journal writes from her - the next journal
+ * write will be triggered immediately after the previous one is
+ * allocated, in bch2_journal_write() - but the journal write error path
+ * is special:
+ */
+ bch2_journal_do_writes(j);
spin_unlock(&j->lock);
}
@@ -1755,11 +1768,13 @@ static CLOSURE_CALLBACK(journal_write_preflush)
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
- closure_wait(&j->async_wait, cl);
+ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
+ closure_wait(&j->async_wait, cl);
+ spin_unlock(&j->lock);
+ continue_at(cl, journal_write_preflush, j->wq);
+ return;
+ }
spin_unlock(&j->lock);
-
- continue_at(cl, journal_write_preflush, j->wq);
- return;
}
if (w->separate_flush) {
@@ -1847,8 +1862,14 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
}
}
- if (wb.wb)
- bch2_journal_keys_to_write_buffer_end(c, &wb);
+ if (wb.wb) {
+ ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
+ if (ret) {
+ bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
+ bch2_err_str(ret));
+ return ret;
+ }
+ }
spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
@@ -2013,8 +2034,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
struct printbuf buf = PRINTBUF;
buf.atomic++;
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write: %s"),
- bch2_err_str(ret));
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"),
+ le64_to_cpu(w->data->seq),
+ bch2_err_str(ret));
__bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
bch2_print_string_as_lines(KERN_ERR, buf.buf);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 79be0eaddfa0..d8a630742887 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -205,6 +205,17 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ rcu_read_lock();
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+ rcu_read_unlock();
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index ed4846709611..1f25c111c54c 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -232,7 +232,7 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
BUG_ON(nr != t->nr);
unsigned i;
- for (src = bl->start, i = eytzinger0_first(t->nr);
+ for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index a40d116224ed..83b1586cb371 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = {
NULL
};
+int bch2_lru_check_set(struct btree_trans *trans,
+ u16 lru_id, u64 time,
+ struct bkey_s_c referring_k,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k =
+ bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
+ lru_pos(lru_id,
+ bucket_to_u64(referring_k.k->p),
+ time), 0);
+ int ret = bkey_err(lru_k);
+ if (ret)
+ return ret;
+
+ if (lru_k.k->type != KEY_TYPE_set) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, alloc_key_to_missing_lru_entry,
+ "missing %s lru entry\n"
+ " %s",
+ bch2_lru_types[lru_type(lru_k)],
+ (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
+ ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &lru_iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
@@ -94,8 +133,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
u64 idx;
int ret;
- if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
- lru_entry_to_invalid_bucket,
+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos),
+ trans, lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0);
@@ -125,7 +164,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
- if (fsck_err(c, lru_entry_bad,
+ if (fsck_err(trans, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index bd71ba77de07..5bd8974a7f11 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -24,18 +24,6 @@ static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time)
return pos;
}
-#define BCH_LRU_TYPES() \
- x(read) \
- x(fragmentation)
-
-enum bch_lru_type {
-#define x(n) BCH_LRU_##n,
- BCH_LRU_TYPES()
-#undef x
-};
-
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
-
static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
@@ -61,6 +49,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+struct bkey_buf;
+int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+
int bch2_check_lrus(struct bch_fs *);
#endif /* _BCACHEFS_LRU_H */
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
new file mode 100644
index 000000000000..f372cb3b8cda
--- /dev/null
+++ b/fs/bcachefs/lru_format.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_FORMAT_H
+#define _BCACHEFS_LRU_FORMAT_H
+
+struct bch_lru {
+ struct bch_val v;
+ __le64 idx;
+} __packed __aligned(8);
+
+#define BCH_LRU_TYPES() \
+ x(read) \
+ x(fragmentation)
+
+enum bch_lru_type {
+#define x(n) BCH_LRU_##n,
+ BCH_LRU_TYPES()
+#undef x
+};
+
+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+
+#define LRU_TIME_BITS 48
+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
+
+#endif /* _BCACHEFS_LRU_FORMAT_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 6e477fadaa2a..7d3920e03742 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -36,31 +36,6 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- printbuf_tabstop_push(out, 20);
- prt_str(out, "rewrite ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
- prt_newline(out);
-
- prt_str(out, "kill ptrs:\t");
- bch2_prt_u64_base2(out, data_opts->kill_ptrs);
- prt_newline(out);
-
- prt_str(out, "target:\t");
- bch2_target_to_text(out, c, data_opts->target);
- prt_newline(out);
-
- prt_str(out, "compression:\t");
- bch2_compression_opt_to_text(out, background_compression(*io_opts));
- prt_newline(out);
-
- prt_str(out, "extra replicas:\t");
- prt_u64(out, data_opts->extra_replicas);
-}
-
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
@@ -805,7 +780,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
- unsigned sectors = btree_ptr_sectors_written(&b->key);
+ unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index eb49dd045eff..deef4f024d20 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -290,18 +290,23 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
{
- prt_printf(out, "Currently waiting for: ");
+ printbuf_tabstop_push(out, 32);
+ prt_printf(out, "running:\t%u\n", c->copygc_running);
+ prt_printf(out, "copygc_wait:\t%llu\n", c->copygc_wait);
+ prt_printf(out, "copygc_wait_at:\t%llu\n", c->copygc_wait_at);
+
+ prt_printf(out, "Currently waiting for:\t");
prt_human_readable_u64(out, max(0LL, c->copygc_wait -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
prt_newline(out);
- prt_printf(out, "Currently waiting since: ");
+ prt_printf(out, "Currently waiting since:\t");
prt_human_readable_u64(out, max(0LL,
atomic64_read(&c->io_clock[WRITE].now) -
c->copygc_wait_at) << 9);
prt_newline(out);
- prt_printf(out, "Currently calculated wait: ");
+ prt_printf(out, "Currently calculated wait:\t");
prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
prt_newline(out);
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index bb068fd72465..e10fc1da71b1 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -378,6 +378,10 @@ int bch2_opt_parse(struct bch_fs *c,
break;
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);
+
+ if (ret == -BCH_ERR_option_needs_open_fs)
+ return ret;
+
if (ret < 0) {
if (err)
prt_printf(err, "%s: parse error",
@@ -460,14 +464,81 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}
+int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
+ struct printbuf *parse_later,
+ const char *name, const char *val)
+{
+ struct printbuf err = PRINTBUF;
+ u64 v;
+ int ret, id;
+
+ id = bch2_mount_opt_lookup(name);
+
+ /* Check for the form "noopt", negation of a boolean opt: */
+ if (id < 0 &&
+ !val &&
+ !strncmp("no", name, 2)) {
+ id = bch2_mount_opt_lookup(name + 2);
+ val = "0";
+ }
+
+ /* Unknown options are ignored: */
+ if (id < 0)
+ return 0;
+
+ if (!(bch2_opt_table[id].flags & OPT_MOUNT))
+ goto bad_opt;
+
+ if (id == Opt_acl &&
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
+ goto bad_opt;
+
+ if ((id == Opt_usrquota ||
+ id == Opt_grpquota) &&
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
+ goto bad_opt;
+
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
+ prt_printf(parse_later, "%s=%s,", name, val);
+ if (parse_later->allocation_failure) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ if (ret < 0)
+ goto bad_val;
+
+ if (opts)
+ bch2_opt_set_by_id(opts, id, v);
+
+ ret = 0;
+ goto out;
+
+bad_opt:
+ pr_err("Bad mount option %s", name);
+ ret = -BCH_ERR_option_name;
+ goto out;
+
+bad_val:
+ pr_err("Invalid mount option %s", err.buf);
+ ret = -BCH_ERR_option_value;
+
+out:
+ printbuf_exit(&err);
+ return ret;
+}
+
int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- char *options)
+ struct printbuf *parse_later, char *options)
{
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
- int ret, id;
- struct printbuf err = PRINTBUF;
- u64 v;
+ int ret;
if (!options)
return 0;
@@ -488,53 +559,16 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
name = strsep(&opt, "=");
val = opt;
- id = bch2_mount_opt_lookup(name);
-
- /* Check for the form "noopt", negation of a boolean opt: */
- if (id < 0 &&
- !val &&
- !strncmp("no", name, 2)) {
- id = bch2_mount_opt_lookup(name + 2);
- val = "0";
- }
-
- /* Unknown options are ignored: */
- if (id < 0)
- continue;
-
- if (!(bch2_opt_table[id].flags & OPT_MOUNT))
- goto bad_opt;
-
- if (id == Opt_acl &&
- !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
- goto bad_opt;
-
- if ((id == Opt_usrquota ||
- id == Opt_grpquota) &&
- !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
- goto bad_opt;
-
- ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
+ ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
if (ret < 0)
- goto bad_val;
-
- bch2_opt_set_by_id(opts, id, v);
+ goto out;
}
ret = 0;
goto out;
-bad_opt:
- pr_err("Bad mount option %s", name);
- ret = -BCH_ERR_option_name;
- goto out;
-bad_val:
- pr_err("Invalid mount option %s", err.buf);
- ret = -BCH_ERR_option_value;
- goto out;
out:
kfree(copied_opts_start);
- printbuf_exit(&err);
return ret;
}
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index b197ec90d4cb..60b93018501f 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -63,6 +63,7 @@ enum opt_flags {
OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
+ OPT_HIDDEN = (1 << 10),
};
enum opt_type {
@@ -406,7 +407,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_HIDDEN, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@@ -488,6 +489,13 @@ struct bch_opts {
#undef x
};
+struct bch2_opts_parse {
+ struct bch_opts opts;
+
+ /* to save opts that can't be parsed before the FS is opened: */
+ struct printbuf parse_later;
+};
+
static const __maybe_unused struct bch_opts bch2_opts_default = {
#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
._name##_defined = true, \
@@ -566,7 +574,10 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *,
int bch2_opt_check_may_set(struct bch_fs *, int, u64);
int bch2_opts_check_may_set(struct bch_fs *);
-int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
+int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
+ struct printbuf *, const char *, const char *);
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
+ char *);
/* inode opts: */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index 9f529e4c1b16..4cf5a2af1e6f 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -316,6 +316,20 @@ void bch2_prt_newline(struct printbuf *buf)
buf->cur_tabstop = 0;
}
+void bch2_printbuf_strip_trailing_newline(struct printbuf *out)
+{
+ for (int p = out->pos - 1; p >= 0; --p) {
+ if (out->buf[p] == '\n') {
+ out->pos = p;
+ break;
+ }
+ if (out->buf[p] != ' ')
+ break;
+ }
+
+ printbuf_nul_terminate_reserved(out);
+}
+
static void __prt_tab(struct printbuf *out)
{
int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out));
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 9ecc56bc9635..1d570387b77f 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -115,6 +115,7 @@ void bch2_printbuf_indent_add(struct printbuf *, unsigned);
void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
void bch2_prt_newline(struct printbuf *);
+void bch2_printbuf_strip_trailing_newline(struct printbuf *);
void bch2_prt_tab(struct printbuf *);
void bch2_prt_tab_rjust(struct printbuf *);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 1f9d044ed920..d89eb43c5ce9 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -10,6 +10,7 @@
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
+#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
#include "fs-common.h"
@@ -90,6 +91,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
bch2_write_super(c);
@@ -134,6 +136,45 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
+static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
+ struct journal_key *k)
+{
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+ BTREE_MAX_DEPTH, k->level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto out;
+
+ struct bkey u;
+ struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
+
+ /* Has this delta already been applied to the btree? */
+ if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
+ ret = 0;
+ goto out;
+ }
+
+ struct bkey_i *new = k->k;
+ if (old.k->type == KEY_TYPE_accounting) {
+ new = bch2_bkey_make_mut_noupdate(trans, bkey_i_to_s_c(k->k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ bch2_accounting_accumulate(bkey_i_to_accounting(new),
+ bkey_s_c_to_accounting(old));
+ }
+
+ trans->journal_res.seq = k->journal_seq;
+
+ ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int bch2_journal_replay_key(struct btree_trans *trans,
struct journal_key *k)
{
@@ -184,6 +225,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten)
goto out;
+ if (k->k->k.type == KEY_TYPE_accounting) {
+ ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ goto out;
+ }
+
ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out:
bch2_trans_iter_exit(trans, &iter);
@@ -222,6 +268,30 @@ int bch2_journal_replay(struct bch_fs *c)
trans = bch2_trans_get(c);
/*
+ * Replay accounting keys first: we can't allow the write buffer to
+ * flush accounting keys until we're done
+ */
+ darray_for_each(*keys, k) {
+ if (!(k->k->k.type == KEY_TYPE_accounting && !k->allocated))
+ continue;
+
+ cond_resched();
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
+ BCH_TRANS_COMMIT_no_journal_res,
+ bch2_journal_replay_accounting_key(trans, k));
+ if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
+ goto err;
+
+ k->overwritten = true;
+ }
+
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
+ /*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
@@ -241,9 +311,10 @@ int bch2_journal_replay(struct bch_fs *c)
commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
bch2_journal_replay_key(trans, k));
- BUG_ON(!ret && !k->overwritten);
+ BUG_ON(!ret && !k->overwritten && k->k->k.type != KEY_TYPE_accounting);
if (ret) {
ret = darray_push(&keys_sorted, k);
if (ret)
@@ -271,6 +342,7 @@ int bch2_journal_replay(struct bch_fs *c)
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_skip_accounting_apply|
(!k->allocated
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
@@ -280,7 +352,7 @@ int bch2_journal_replay(struct bch_fs *c)
if (ret)
goto err;
- BUG_ON(!k->overwritten);
+ BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}
/*
@@ -355,45 +427,10 @@ static int journal_replay_entry_early(struct bch_fs *c,
container_of(entry, struct jset_entry_usage, entry);
switch (entry->btree_id) {
- case BCH_FS_USAGE_reserved:
- if (entry->level < BCH_REPLICAS_MAX)
- c->usage_base->persistent_reserved[entry->level] =
- le64_to_cpu(u->v);
- break;
- case BCH_FS_USAGE_inodes:
- c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
- break;
case BCH_FS_USAGE_key_version:
- atomic64_set(&c->key_version,
- le64_to_cpu(u->v));
+ atomic64_set(&c->key_version, le64_to_cpu(u->v));
break;
}
-
- break;
- }
- case BCH_JSET_ENTRY_data_usage: {
- struct jset_entry_data_usage *u =
- container_of(entry, struct jset_entry_data_usage, entry);
-
- ret = bch2_replicas_set_usage(c, &u->r,
- le64_to_cpu(u->v));
- break;
- }
- case BCH_JSET_ENTRY_dev_usage: {
- struct jset_entry_dev_usage *u =
- container_of(entry, struct jset_entry_dev_usage, entry);
- unsigned nr_types = jset_entry_dev_usage_nr_types(u);
-
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu(c, le32_to_cpu(u->dev));
- if (ca)
- for (unsigned i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
- ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
- ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
- ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
- }
- rcu_read_unlock();
-
break;
}
case BCH_JSET_ENTRY_blacklist: {
@@ -454,8 +491,6 @@ static int journal_replay_early(struct bch_fs *c,
}
}
- bch2_fs_usage_initialize(c);
-
return 0;
}
@@ -810,6 +845,10 @@ use_clean:
if (ret)
goto err;
+ set_bit(BCH_FS_btree_running, &c->flags);
+
+ ret = bch2_sb_set_upgrade_extra(c);
+
ret = bch2_run_recovery_passes(c);
if (ret)
goto err;
@@ -969,14 +1008,12 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
+ set_bit(BCH_FS_btree_running, &c->flags);
set_bit(BCH_FS_may_go_rw, &c->flags);
for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc_fake(c, i, 0);
- for_each_member_device(c, ca)
- bch2_dev_usage_init(ca);
-
ret = bch2_fs_journal_alloc(c);
if (ret)
goto err;
@@ -986,12 +1023,21 @@ int bch2_fs_initialize(struct bch_fs *c)
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal, 1);
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
+ for_each_member_device(c, ca) {
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret) {
+ bch2_dev_put(ca);
+ goto err;
+ }
+ }
+
/*
* Write out the superblock and journal buckets, now that we can do
* btree updates
@@ -1025,7 +1071,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_pack(&packed_inode, &root_inode);
packed_inode.inode.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "creating root directory");
if (ret)
goto err;
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 4a9eb9582b6e..73339a0a3111 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_gc.h"
#include "btree_node_scan.h"
+#include "disk_accounting.h"
#include "ec.h"
#include "fsck.h"
#include "inode.h"
@@ -192,6 +193,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;
+ down_read(&c->state_lock);
+
for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
struct recovery_pass_fn *p = recovery_pass_fns + i;
@@ -207,6 +210,8 @@ int bch2_run_online_recovery_passes(struct bch_fs *c)
break;
}
+ up_read(&c->state_lock);
+
return ret;
}
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 773aea9a0080..8c7dee5983d2 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -15,6 +15,7 @@
#define BCH_RECOVERY_PASSES() \
x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
x(alloc_read, 0, PASS_ALWAYS) \
x(stripes_read, 1, PASS_ALWAYS) \
x(initialize_subvolumes, 2, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 9ac6cf21cfbf..5f92715e1525 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -171,7 +171,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
not_found:
BUG_ON(!(flags & BTREE_TRIGGER_check_repair));
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 57a1f09cca09..10c96cb2047a 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "buckets.h"
+#include "disk_accounting.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
@@ -243,145 +244,25 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r,
return __replicas_entry_idx(r, search) >= 0;
}
-bool bch2_replicas_marked(struct bch_fs *c,
+bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
- bool marked;
-
- if (!search->nr_devs)
- return true;
-
verify_replicas_entry(search);
- percpu_down_read(&c->mark_lock);
- marked = __replicas_has_entry(&c->replicas, search) &&
- (likely((!c->replicas_gc.entries)) ||
- __replicas_has_entry(&c->replicas_gc, search));
- percpu_up_read(&c->mark_lock);
-
- return marked;
-}
-
-static void __replicas_table_update(struct bch_fs_usage *dst,
- struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage *src,
- struct bch_replicas_cpu *src_r)
-{
- int src_idx, dst_idx;
-
- *dst = *src;
-
- for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
- if (!src->replicas[src_idx])
- continue;
-
- dst_idx = __replicas_entry_idx(dst_r,
- cpu_replicas_entry(src_r, src_idx));
- BUG_ON(dst_idx < 0);
-
- dst->replicas[dst_idx] = src->replicas[src_idx];
- }
-}
-
-static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
- struct bch_replicas_cpu *dst_r,
- struct bch_fs_usage __percpu *src_p,
- struct bch_replicas_cpu *src_r)
-{
- unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
- struct bch_fs_usage *dst, *src = (void *)
- bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);
-
- preempt_disable();
- dst = this_cpu_ptr(dst_p);
- preempt_enable();
-
- __replicas_table_update(dst, dst_r, src, src_r);
+ return !search->nr_devs ||
+ (__replicas_has_entry(&c->replicas, search) &&
+ (likely((!c->replicas_gc.entries)) ||
+ __replicas_has_entry(&c->replicas_gc, search)));
}
-/*
- * Resize filesystem accounting:
- */
-static int replicas_table_update(struct bch_fs *c,
- struct bch_replicas_cpu *new_r)
+bool bch2_replicas_marked(struct bch_fs *c,
+ struct bch_replicas_entry_v1 *search)
{
- struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
- struct bch_fs_usage_online *new_scratch = NULL;
- struct bch_fs_usage __percpu *new_gc = NULL;
- struct bch_fs_usage *new_base = NULL;
- unsigned i, bytes = sizeof(struct bch_fs_usage) +
- sizeof(u64) * new_r->nr;
- unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
- sizeof(u64) * new_r->nr;
- int ret = 0;
-
- memset(new_usage, 0, sizeof(new_usage));
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
- sizeof(u64), GFP_KERNEL)))
- goto err;
+ percpu_down_read(&c->mark_lock);
+ bool ret = bch2_replicas_marked_locked(c, search);
+ percpu_up_read(&c->mark_lock);
- if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
- !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
- (c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
- goto err;
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- if (c->usage[i])
- __replicas_table_update_pcpu(new_usage[i], new_r,
- c->usage[i], &c->replicas);
- if (c->usage_base)
- __replicas_table_update(new_base, new_r,
- c->usage_base, &c->replicas);
- if (c->usage_gc)
- __replicas_table_update_pcpu(new_gc, new_r,
- c->usage_gc, &c->replicas);
-
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- swap(c->usage[i], new_usage[i]);
- swap(c->usage_base, new_base);
- swap(c->usage_scratch, new_scratch);
- swap(c->usage_gc, new_gc);
- swap(c->replicas, *new_r);
-out:
- free_percpu(new_gc);
- kfree(new_scratch);
- for (i = 0; i < ARRAY_SIZE(new_usage); i++)
- free_percpu(new_usage[i]);
- kfree(new_base);
return ret;
-err:
- bch_err(c, "error updating replicas table: memory allocation failure");
- ret = -BCH_ERR_ENOMEM_replicas_table;
- goto out;
-}
-
-static unsigned reserve_journal_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *r)
-{
- struct bch_replicas_entry_v1 *e;
- unsigned journal_res_u64s = 0;
-
- /* nr_inodes: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
-
- /* key_version: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
-
- /* persistent_reserved: */
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
- BCH_REPLICAS_MAX;
-
- for_each_cpu_replicas_entry(r, e)
- journal_res_u64s +=
- DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
- e->nr_devs, sizeof(u64));
- return journal_res_u64s;
}
noinline
@@ -417,10 +298,6 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
goto err;
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->replicas_journal_res,
- reserve_journal_replicas(c, &new_r));
}
if (!new_r.entries &&
@@ -435,7 +312,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
/* don't update in memory replicas until changes are persistent */
percpu_down_write(&c->mark_lock);
if (new_r.entries)
- ret = replicas_table_update(c, &new_r);
+ swap(c->replicas, new_r);
if (new_gc.entries)
swap(new_gc, c->replicas_gc);
percpu_up_write(&c->mark_lock);
@@ -457,20 +334,6 @@ int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
? 0 : bch2_mark_replicas_slowpath(c, r);
}
-/* replicas delta list: */
-
-int bch2_replicas_delta_list_mark(struct bch_fs *c,
- struct replicas_delta_list *r)
-{
- struct replicas_delta *d = r->d;
- struct replicas_delta *top = (void *) r->d + r->used;
- int ret = 0;
-
- for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
- ret = bch2_mark_replicas(c, &d->r);
- return ret;
-}
-
/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
@@ -484,8 +347,9 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
percpu_down_write(&c->mark_lock);
ret = ret ?:
- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
- replicas_table_update(c, &c->replicas_gc);
+ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
+ if (!ret)
+ swap(c->replicas, c->replicas_gc);
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
@@ -556,10 +420,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
int bch2_replicas_gc2(struct bch_fs *c)
{
struct bch_replicas_cpu new = { 0 };
- unsigned i, nr;
+ unsigned nr;
int ret = 0;
- bch2_journal_meta(&c->journal);
+ bch2_accounting_mem_gc(c);
retry:
nr = READ_ONCE(c->replicas.nr);
new.entry_size = READ_ONCE(c->replicas.entry_size);
@@ -580,24 +444,33 @@ retry:
goto retry;
}
- for (i = 0; i < c->replicas.nr; i++) {
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- if (e->data_type == BCH_DATA_journal ||
- c->usage_base->replicas[i] ||
- percpu_u64_get(&c->usage[0]->replicas[i]) ||
- percpu_u64_get(&c->usage[1]->replicas[i]) ||
- percpu_u64_get(&c->usage[2]->replicas[i]) ||
- percpu_u64_get(&c->usage[3]->replicas[i]))
+ struct disk_accounting_pos k = {
+ .type = BCH_DISK_ACCOUNTING_replicas,
+ };
+
+ memcpy(&k.replicas, e, replicas_entry_bytes(e));
+
+ struct bpos p = disk_accounting_pos_to_bpos(&k);
+
+ struct bch_accounting_mem *acc = &c->accounting;
+ bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, &p) >= acc->k.nr;
+
+ if (e->data_type == BCH_DATA_journal || !kill)
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
bch2_cpu_replicas_sort(&new);
- ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
- replicas_table_update(c, &new);
+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
+
+ if (!ret)
+ swap(c->replicas, new);
kfree(new.entries);
@@ -611,34 +484,6 @@ retry:
return ret;
}
-int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry_v1 *r,
- u64 sectors)
-{
- int ret, idx = bch2_replicas_entry_idx(c, r);
-
- if (idx < 0) {
- struct bch_replicas_cpu n;
-
- n = cpu_replicas_add_entry(c, &c->replicas, r);
- if (!n.entries)
- return -BCH_ERR_ENOMEM_cpu_replicas;
-
- ret = replicas_table_update(c, &n);
- if (ret)
- return ret;
-
- kfree(n.entries);
-
- idx = bch2_replicas_entry_idx(c, r);
- BUG_ON(ret < 0);
- }
-
- c->usage_base->replicas[idx] = sectors;
-
- return 0;
-}
-
/* Replicas tracking - superblock: */
static int
@@ -724,8 +569,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
bch2_cpu_replicas_sort(&new_r);
percpu_down_write(&c->mark_lock);
-
- ret = replicas_table_update(c, &new_r);
+ swap(c->replicas, new_r);
percpu_up_write(&c->mark_lock);
kfree(new_r.entries);
@@ -1027,10 +871,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
- unsigned ret;
-
mutex_lock(&c->sb_lock);
- ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
+ unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
mutex_unlock(&c->sb_lock);
return ret;
@@ -1038,25 +880,6 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
void bch2_fs_replicas_exit(struct bch_fs *c)
{
- unsigned i;
-
- kfree(c->usage_scratch);
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- free_percpu(c->usage[i]);
- kfree(c->usage_base);
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
-
- mempool_exit(&c->replicas_delta_pool);
-}
-
-int bch2_fs_replicas_init(struct bch_fs *c)
-{
- bch2_journal_entry_res_resize(&c->journal,
- &c->replicas_journal_res,
- reserve_journal_replicas(c, &c->replicas));
-
- return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
- REPLICAS_DELTA_LIST_MAX) ?:
- replicas_table_update(c, &c->replicas);
}
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 654a4b26d3a3..622482559c3d 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -25,18 +25,13 @@ int bch2_replicas_entry_idx(struct bch_fs *,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
+
+bool bch2_replicas_marked_locked(struct bch_fs *,
+ struct bch_replicas_entry_v1 *);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry_v1 *);
-static inline struct replicas_delta *
-replicas_delta_next(struct replicas_delta *d)
-{
- return (void *) d + replicas_entry_bytes(&d->r) + 8;
-}
-
-int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
@@ -58,10 +53,6 @@ int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
-int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry_v1 *,
- u64);
-
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
@@ -88,6 +79,5 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
void bch2_fs_replicas_exit(struct bch_fs *);
-int bch2_fs_replicas_init(struct bch_fs *);
#endif /* _BCACHEFS_REPLICAS_H */
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index ac90d142c4e8..fed71c861fe7 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -8,20 +8,4 @@ struct bch_replicas_cpu {
struct bch_replicas_entry_v1 *entries;
};
-struct replicas_delta {
- s64 delta;
- struct bch_replicas_entry_v1 r;
-} __packed;
-
-struct replicas_delta_list {
- unsigned size;
- unsigned used;
-
- struct {} memset_start;
- u64 nr_inodes;
- u64 persistent_reserved[BCH_REPLICAS_MAX];
- struct {} memset_end;
- struct replicas_delta d[];
-};
-
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 47f10ab57f40..c57d42bb8d1b 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -183,25 +183,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
- percpu_down_read(&c->mark_lock);
-
- if (!journal_seq) {
- for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
- bch2_fs_usage_acc_to_base(c, i);
- } else {
- bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
- }
-
- {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
- }
-
{
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
@@ -212,49 +193,6 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
- for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct jset_entry_usage *u =
- container_of(jset_entry_init(end, sizeof(*u)),
- struct jset_entry_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_usage;
- u->entry.btree_id = BCH_FS_USAGE_reserved;
- u->entry.level = i;
- u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
- }
-
- for (unsigned i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry_v1 *e =
- cpu_replicas_entry(&c->replicas, i);
- struct jset_entry_data_usage *u =
- container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
- struct jset_entry_data_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_data_usage;
- u->v = cpu_to_le64(c->usage_base->replicas[i]);
- unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
- "embedded variable length struct");
- }
-
- for_each_member_device(c, ca) {
- unsigned b = sizeof(struct jset_entry_dev_usage) +
- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
- struct jset_entry_dev_usage *u =
- container_of(jset_entry_init(end, b),
- struct jset_entry_dev_usage, entry);
-
- u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(ca->dev_idx);
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++) {
- u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
- u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
- u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
- }
- }
-
- percpu_up_read(&c->mark_lock);
-
for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 4710b61631f0..dfbbd33c8731 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -54,9 +54,32 @@
BCH_FSCK_ERR_subvol_children_not_set) \
x(mi_btree_bitmap, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_btree_bitmap_not_marked)
-
-#define DOWNGRADE_TABLE()
+ BCH_FSCK_ERR_btree_bitmap_not_marked) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_bkey_version_in_future, \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_accounting_mismatch)
+
+#define DOWNGRADE_TABLE() \
+ x(bucket_stripe_sectors, \
+ 0) \
+ x(disk_accounting_v2, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
+ BCH_FSCK_ERR_dev_usage_buckets_wrong, \
+ BCH_FSCK_ERR_dev_usage_sectors_wrong, \
+ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
+ BCH_FSCK_ERR_fs_usage_hidden_wrong, \
+ BCH_FSCK_ERR_fs_usage_btree_wrong, \
+ BCH_FSCK_ERR_fs_usage_data_wrong, \
+ BCH_FSCK_ERR_fs_usage_cached_wrong, \
+ BCH_FSCK_ERR_fs_usage_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
+ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
+ BCH_FSCK_ERR_fs_usage_replicas_wrong, \
+ BCH_FSCK_ERR_bkey_version_in_future)
struct upgrade_downgrade_entry {
u64 recovery_passes;
@@ -80,6 +103,37 @@ UPGRADE_TABLE()
#undef x
};
+static int have_stripes(struct bch_fs *c)
+{
+ return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
+}
+
+int bch2_sb_set_upgrade_extra(struct bch_fs *c)
+{
+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
+ unsigned new_version = c->sb.version;
+ bool write_sb = false;
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
+ new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
+ (ret = have_stripes(c) > 0)) {
+ __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
+ write_sb = true;
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret < 0 ? ret : 0;
+}
+
void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
@@ -101,16 +155,12 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
ext->recovery_passes_required[0] |=
cpu_to_le64(bch2_recovery_passes_to_stable(passes));
- for (const u16 *e = i->errors;
- e < i->errors + i->nr_errors;
- e++) {
- __set_bit(*e, c->sb.errors_silent);
- ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
- }
+ for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
+ __set_bit_le64(*e, ext->errors_silent);
}
}
-#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x
@@ -125,6 +175,37 @@ DOWNGRADE_TABLE()
#undef x
};
+static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
+{
+ struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
+ unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
+ int ret = 0;
+
+ unsigned nr_errors = le16_to_cpu(dst->nr_errors);
+
+ switch (le16_to_cpu(dst->version)) {
+ case bcachefs_metadata_version_bucket_stripe_sectors:
+ if (have_stripes(c)) {
+ bytes += sizeof(dst->errors[0]) * 2;
+
+ ret = darray_make_room(table, bytes);
+ if (ret)
+ return ret;
+
+ /* open coded __set_bit_le64, as dst is packed and
+ * dst->recovery_passes is misaligned */
+ unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
+ dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
+
+ dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
+ }
+ break;
+ }
+
+ dst->nr_errors = cpu_to_le16(nr_errors);
+ return ret;
+}
+
static inline const struct bch_sb_field_downgrade_entry *
downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
{
@@ -210,6 +291,9 @@ const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
int bch2_sb_downgrade_update(struct bch_fs *c)
{
+ if (!test_bit(BCH_FS_btree_running, &c->flags))
+ return 0;
+
darray_char table = {};
int ret = 0;
@@ -234,7 +318,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);
- table.nr += bytes;
+ downgrade_table_extra(c, &table);
+
+ if (!dst->recovery_passes[0] &&
+ !dst->recovery_passes[1] &&
+ !dst->nr_errors)
+ continue;
+
+ table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
}
struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
index 57e6c916fc73..095b7cc9bb47 100644
--- a/fs/bcachefs/sb-downgrade.h
+++ b/fs/bcachefs/sb-downgrade.h
@@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
+int bch2_sb_set_upgrade_extra(struct bch_fs *);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index bda33e59e226..c1270d790e43 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -110,19 +110,25 @@ out:
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
bch_sb_errors_cpu *src = &c->fsck_error_counts;
- struct bch_sb_field_errors *dst =
- bch2_sb_field_resize(&c->disk_sb, errors,
- bch2_sb_field_errors_u64s(src->nr));
+ struct bch_sb_field_errors *dst;
unsigned i;
+ mutex_lock(&c->fsck_error_counts_lock);
+
+ dst = bch2_sb_field_resize(&c->disk_sb, errors,
+ bch2_sb_field_errors_u64s(src->nr));
+
if (!dst)
- return;
+ goto err;
for (i = 0; i < src->nr; i++) {
SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
}
+
+err:
+ mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index d6f35a99c429..d1b2f2aa397a 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -227,8 +227,8 @@ enum bch_fsck_flags {
x(deleted_inode_is_dir, 213, 0) \
x(deleted_inode_not_unlinked, 214, 0) \
x(extent_overlapping, 215, 0) \
- x(extent_in_missing_inode, 216, 0) \
- x(extent_in_non_reg_inode, 217, 0) \
+ x(key_in_missing_inode, 216, 0) \
+ x(key_in_wrong_inode_type, 217, 0) \
x(extent_past_end_of_inode, 218, 0) \
x(dirent_empty_name, 219, 0) \
x(dirent_val_too_big, 220, 0) \
@@ -286,7 +286,8 @@ enum bch_fsck_flags {
x(accounting_mismatch, 272, 0) \
x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \
- x(alloc_key_io_time_bad, 275, 0)
+ x(alloc_key_io_time_bad, 275, 0) \
+ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h
index c1860d8163fb..c4b3d8d3f414 100644
--- a/fs/bcachefs/seqmutex.h
+++ b/fs/bcachefs/seqmutex.h
@@ -19,17 +19,14 @@ static inline bool seqmutex_trylock(struct seqmutex *lock)
static inline void seqmutex_lock(struct seqmutex *lock)
{
mutex_lock(&lock->lock);
-}
-
-static inline void seqmutex_unlock(struct seqmutex *lock)
-{
lock->seq++;
- mutex_unlock(&lock->lock);
}
-static inline u32 seqmutex_seq(struct seqmutex *lock)
+static inline u32 seqmutex_unlock(struct seqmutex *lock)
{
- return lock->seq;
+ u32 seq = lock->seq;
+ mutex_unlock(&lock->lock);
+ return seq;
}
static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq)
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 4ef98e696673..96744b1a76f5 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -168,6 +168,9 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
+ if (unlikely(new_bytes > INT_MAX))
+ return NULL;
+
new = kvzalloc(new_bytes, GFP_KERNEL);
if (!new)
return NULL;
@@ -549,7 +552,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
- c, snapshot_tree_to_missing_snapshot,
+ trans, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@@ -562,19 +565,19 @@ static int check_snapshot_tree(struct btree_trans *trans,
goto err;
if (fsck_err_on(ret,
- c, snapshot_tree_to_missing_subvol,
+ trans, snapshot_tree_to_missing_subvol,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
- c, snapshot_tree_to_wrong_subvol,
+ trans, snapshot_tree_to_wrong_subvol,
"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
- c, snapshot_tree_to_snapshot_subvol,
+ trans, snapshot_tree_to_snapshot_subvol,
"snapshot tree points to snapshot subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@@ -811,7 +814,7 @@ static int check_snapshot(struct btree_trans *trans,
}
} else {
if (fsck_err_on(s.subvol,
- c, snapshot_should_not_have_subvol,
+ trans, snapshot_should_not_have_subvol,
"snapshot should not point to subvol:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -828,7 +831,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+ if (fsck_err_on(!ret,
+ trans, snapshot_to_bad_snapshot_tree,
"snapshot points to missing/incorrect tree:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
@@ -840,7 +844,7 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);
if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
- c, snapshot_bad_depth,
+ trans, snapshot_bad_depth,
"snapshot with incorrect depth field, should be %u:\n %s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -856,7 +860,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+ if (fsck_err_on(!ret,
+ trans, snapshot_bad_skiplist,
"snapshot with bad skiplist field:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
@@ -1018,7 +1023,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
darray_for_each(*t, id) {
if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
- c, snapshot_node_missing,
+ trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
@@ -1050,8 +1055,8 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
- if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
- bkey_in_missing_snapshot,
+ if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot),
+ trans, bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index dfc9cf305756..f56720b55862 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -57,7 +57,7 @@ static int check_subvol(struct btree_trans *trans,
if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
subvol.v->fs_path_parent,
- c, subvol_root_fs_path_parent_nonzero,
+ trans, subvol_root_fs_path_parent_nonzero,
"root subvolume has nonzero fs_path_parent\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct bkey_i_subvolume *n =
@@ -80,7 +80,7 @@ static int check_subvol(struct btree_trans *trans,
goto err;
if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
- c, subvol_children_not_set,
+ trans, subvol_children_not_set,
"subvolume not set in subvolume_children btree at %llu:%llu\n%s",
pos.inode, pos.offset,
(printbuf_reset(&buf),
@@ -101,7 +101,8 @@ static int check_subvol(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if (fsck_err_on(ret, c, subvol_to_missing_root,
+ if (fsck_err_on(ret,
+ trans, subvol_to_missing_root,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
@@ -111,7 +112,7 @@ static int check_subvol(struct btree_trans *trans,
}
if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- c, subvol_root_wrong_bi_subvol,
+ trans, subvol_root_wrong_bi_subvol,
"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
inode.bi_inum, inode_iter.k.p.snapshot,
inode.bi_subvol, subvol.k->p.offset)) {
@@ -139,7 +140,7 @@ static int check_subvol(struct btree_trans *trans,
return ret;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
- c, subvol_not_master_and_not_snapshot,
+ trans, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
@@ -173,7 +174,6 @@ static int check_subvol_child(struct btree_trans *trans,
struct btree_iter *child_iter,
struct bkey_s_c child_k)
{
- struct bch_fs *c = trans->c;
struct bch_subvolume s;
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
0, subvolume, &s);
@@ -182,7 +182,7 @@ static int check_subvol_child(struct btree_trans *trans,
if (fsck_err_on(ret ||
le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
- c, subvol_children_bad,
+ trans, subvol_children_bad,
"incorrect entry in subvolume_children btree %llu:%llu",
child_k.k->p.inode, child_k.k->p.offset)) {
ret = bch2_btree_delete_at(trans, child_iter, 0);
@@ -630,9 +630,9 @@ int bch2_initialize_subvolumes(struct bch_fs *c)
root_volume.v.snapshot = cpu_to_le32(U32_MAX);
root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO);
- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
- bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?:
+ bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index b156fc85b8a3..8bc819832790 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1312,7 +1312,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
prt_printf(out, "Label:\t");
- prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
+ if (!strlen(sb->label))
+ prt_printf(out, "(none)");
+ else
+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
prt_printf(out, "Version:\t");
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 9083df82073a..0455a1001fec 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -25,6 +25,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -88,6 +89,19 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
+void bch2_print_str(struct bch_fs *c, const char *str)
+{
+#ifdef __KERNEL__
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ if (unlikely(stdio)) {
+ bch2_stdio_redirect_printf(stdio, true, "%s", str);
+ return;
+ }
+#endif
+ bch2_print_string_as_lines(KERN_ERR, str);
+}
+
__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
@@ -222,22 +236,6 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
return c;
}
-static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
-{
- unsigned nr = 0, u64s =
- ((sizeof(struct jset_entry_dev_usage) +
- sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
- sizeof(u64);
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, NULL)
- nr++;
- rcu_read_unlock();
-
- bch2_journal_entry_res_resize(&c->journal,
- &c->dev_usage_journal_res, u64s * nr);
-}
-
/* Filesystem RO/RW: */
/*
@@ -376,6 +374,7 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
+ bch2_verify_accounting_clean(c);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@@ -536,7 +535,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
- bch2_fs_allocator_background_exit(c);
+ bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
@@ -564,11 +563,15 @@ static void __bch2_fs_free(struct bch_fs *c)
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
- EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved));
- free_percpu(c->online_reserved);
+ if (c->online_reserved) {
+ u64 v = percpu_u64_get(c->online_reserved);
+ WARN(v, "online_reserved not 0 at shutdown: %lli", v);
+ free_percpu(c->online_reserved);
+ }
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
+ free_percpu(c->usage);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@@ -787,8 +790,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->list);
- mutex_init(&c->usage_scratch_lock);
-
mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->snapshot_table_lock);
init_rwsem(&c->snapshot_create_lock);
@@ -896,6 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
@@ -911,7 +913,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
- bch2_fs_replicas_init(c) ?:
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
@@ -942,7 +943,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_journal_entry_res_resize(&c->journal,
&c->btree_root_journal_res,
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
- bch2_dev_usage_journal_reserve(c);
bch2_journal_entry_res_resize(&c->journal,
&c->clock_journal_res,
(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
@@ -968,7 +968,7 @@ static void print_mount_opts(struct bch_fs *c)
struct printbuf p = PRINTBUF;
bool first = true;
- prt_str(&p, "mounting version ");
+ prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
if (c->opts.read_only) {
@@ -1195,6 +1195,7 @@ static void bch2_dev_free(struct bch_dev *ca)
kfree(ca->buckets_nouse);
bch2_free_super(&ca->disk_sb);
+ bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
free_percpu(ca->io_done);
@@ -1317,6 +1318,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ bch2_dev_allocator_background_init(ca);
+
if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
@@ -1529,6 +1532,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
* The allocator thread itself allocates btree nodes, so stop it first:
*/
bch2_dev_allocator_remove(c, ca);
+ bch2_recalc_capacity(c);
bch2_dev_journal_stop(&c->journal, ca);
}
@@ -1540,6 +1544,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ bch2_dev_do_discards(ca);
}
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1608,7 +1613,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
- BTREE_TRIGGER_norun, NULL);
+ BTREE_TRIGGER_norun, NULL) ?:
+ bch2_dev_usage_remove(c, ca->dev_idx);
bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1645,6 +1651,16 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (ret)
goto err;
+ /*
+ * We need to flush the entire journal to get rid of keys that reference
+ * the device being removed before removing the superblock entry
+ */
+ bch2_journal_flush_all_pins(&c->journal);
+
+ /*
+ * this is really just needed for the bch2_replicas_gc_(start|end)
+ * calls, and could be cleaned up:
+ */
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
@@ -1688,17 +1704,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
bch2_dev_free(ca);
/*
- * At this point the device object has been removed in-core, but the
- * on-disk journal might still refer to the device index via sb device
- * usage entries. Recovery fails if it sees usage information for an
- * invalid device. Flush journal pins to push the back of the journal
- * past now invalid device index references before we update the
- * superblock, but after the device object has been removed so any
- * further journal writes elide usage info for the device.
- */
- bch2_journal_flush_all_pins(&c->journal);
-
- /*
* Free this device's slot in the bch_member array - all pointers to
* this device must be gone:
*/
@@ -1710,8 +1715,6 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
-
- bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_rw &&
@@ -1759,13 +1762,11 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
}
- bch2_dev_usage_init(ca);
-
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret)
goto err;
- ret = bch2_dev_journal_alloc(ca);
+ ret = bch2_dev_journal_alloc(ca, true);
bch_err_msg(c, ret, "allocating journal");
if (ret)
goto err;
@@ -1842,7 +1843,9 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- bch2_dev_usage_journal_reserve(c);
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret)
+ goto err_late;
ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
bch_err_msg(ca, ret, "marking new superblock");
@@ -1925,7 +1928,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
}
if (!ca->journal.nr) {
- ret = bch2_dev_journal_alloc(ca);
+ ret = bch2_dev_journal_alloc(ca, false);
bch_err_msg(ca, ret, "allocating journal");
if (ret)
goto err;
@@ -2014,15 +2017,18 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ struct disk_accounting_pos acc = {
+ .type = BCH_DISK_ACCOUNTING_dev_data_type,
+ .dev_data_type.dev = ca->dev_idx,
+ .dev_data_type.data_type = BCH_DATA_free,
+ };
+ u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
+
+ ret = bch2_trans_do(ca->fs, NULL, NULL, 0,
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
+ bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
goto err;
-
- /*
- * XXX: this is all wrong transactionally - we'll be able to do
- * this correctly after the disk space accounting rewrite
- */
- ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
}
bch2_recalc_capacity(c);
@@ -2034,6 +2040,9 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
+ if (!strncmp(name, "/dev/", strlen("/dev/")))
+ name += strlen("/dev/");
+
for_each_member_device(c, ca)
if (!strcmp(name, ca->name))
return ca;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 93ca74d108b1..1c0d1fb20276 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,10 +17,12 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
+#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -140,8 +142,10 @@ write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
write_attribute(trigger_journal_flush);
+write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_freelist_wakeup);
rw_attribute(gc_gens_pos);
read_attribute(uuid);
@@ -168,6 +172,7 @@ read_attribute(compression_stats);
read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
+read_attribute(btree_reserve_cache);
read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
@@ -198,6 +203,8 @@ read_attribute(disk_groups);
read_attribute(has_data);
read_attribute(alloc_debug);
+read_attribute(accounting);
+read_attribute(usage_base);
#define x(t, n, ...) read_attribute(t);
BCH_PERSISTENT_COUNTERS()
@@ -251,91 +258,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct btree_trans *trans;
- enum btree_id id;
- struct compression_type_stats {
- u64 nr_extents;
- u64 sectors_compressed;
- u64 sectors_uncompressed;
- } s[BCH_COMPRESSION_TYPE_NR];
- u64 compressed_incompressible = 0;
- int ret = 0;
-
- memset(s, 0, sizeof(s));
-
- if (!test_bit(BCH_FS_started, &c->flags))
- return -EPERM;
-
- trans = bch2_trans_get(c);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- if (!btree_type_has_ptrs(id))
- continue;
-
- ret = for_each_btree_key(trans, iter, id, POS_MIN,
- BTREE_ITER_all_snapshots, k, ({
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bch_extent_crc_unpacked crc;
- const union bch_extent_entry *entry;
- bool compressed = false, incompressible = false;
-
- bkey_for_each_crc(k.k, ptrs, crc, entry) {
- incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
- compressed |= crc_is_compressed(crc);
-
- if (crc_is_compressed(crc)) {
- s[crc.compression_type].nr_extents++;
- s[crc.compression_type].sectors_compressed += crc.compressed_size;
- s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
- }
- }
-
- compressed_incompressible += compressed && incompressible;
-
- if (!compressed) {
- unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
-
- s[t].nr_extents++;
- s[t].sectors_compressed += k.k->size;
- s[t].sectors_uncompressed += k.k->size;
- }
- 0;
- }));
- }
-
- bch2_trans_put(trans);
-
- if (ret)
- return ret;
-
+ prt_str(out, "type");
printbuf_tabstop_push(out, 12);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 24);
prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
- for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
+ for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
+ struct disk_accounting_pos a = {
+ .type = BCH_DISK_ACCOUNTING_compression,
+ .compression.type = i,
+ };
+ struct bpos p = disk_accounting_pos_to_bpos(&a);
+ u64 v[3];
+ bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
+
+ u64 nr_extents = v[0];
+ u64 sectors_uncompressed = v[1];
+ u64 sectors_compressed = v[2];
+
bch2_prt_compression_type(out, i);
prt_tab(out);
- prt_human_readable_u64(out, s[i].sectors_compressed << 9);
+ prt_human_readable_u64(out, sectors_compressed << 9);
prt_tab_rjust(out);
- prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
+ prt_human_readable_u64(out, sectors_uncompressed << 9);
prt_tab_rjust(out);
- prt_human_readable_u64(out, s[i].nr_extents
- ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
+ prt_human_readable_u64(out, nr_extents
+ ? div_u64(sectors_uncompressed << 9, nr_extents)
: 0);
prt_tab_rjust(out);
prt_newline(out);
}
- if (compressed_incompressible) {
- prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
- prt_newline(out);
- }
-
return 0;
}
@@ -346,6 +304,20 @@ static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
prt_printf(out, "\n");
}
+static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_usage_base b = {};
+
+ acc_u64s_percpu(&b.hidden, &c->usage->hidden, sizeof(b) / sizeof(u64));
+
+ prt_printf(out, "hidden:\t\t%llu\n", b.hidden);
+ prt_printf(out, "btree:\t\t%llu\n", b.btree);
+ prt_printf(out, "data:\t\t%llu\n", b.data);
+ prt_printf(out, "cached:\t%llu\n", b.cached);
+ prt_printf(out, "reserved:\t\t%llu\n", b.reserved);
+ prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes);
+}
+
SHOW(bch2_fs)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -388,6 +360,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_key_cache)
bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
+ if (attr == &sysfs_btree_reserve_cache)
+ bch2_btree_reserve_cache_to_text(out, c);
+
if (attr == &sysfs_stripes_heap)
bch2_stripes_heap_to_text(out, c);
@@ -429,6 +404,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_alloc_debug)
bch2_fs_alloc_debug_to_text(out, c);
+ if (attr == &sysfs_accounting)
+ bch2_fs_accounting_to_text(out, c);
+
+ if (attr == &sysfs_usage_base)
+ bch2_fs_usage_base_to_text(out, c);
+
return 0;
}
@@ -497,6 +478,12 @@ STORE(bch2_fs)
bch2_journal_meta(&c->journal);
}
+ if (attr == &sysfs_trigger_journal_writes)
+ bch2_journal_do_writes(&c->journal);
+
+ if (attr == &sysfs_trigger_freelist_wakeup)
+ closure_wake_up(&c->freelist_wait);
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -599,6 +586,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_journal_debug,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
+ &sysfs_btree_reserve_cache,
&sysfs_new_stripes,
&sysfs_stripes_heap,
&sysfs_open_buckets,
@@ -615,8 +603,10 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
&sysfs_trigger_journal_flush,
+ &sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_freelist_wakeup,
&sysfs_gc_gens_pos,
@@ -633,6 +623,8 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_disk_groups,
&sysfs_alloc_debug,
+ &sysfs_accounting,
+ &sysfs_usage_base,
NULL
};
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 68104b2056d9..01b768c9b767 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -121,7 +121,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ck.k.p.offset = i;
ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -176,7 +176,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ck.k.p.snapshot = U32_MAX;
ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -232,7 +232,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ck.k.p.offset = i * 2;
ck.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -292,7 +292,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ck.k.p.snapshot = U32_MAX;
ck.k.size = 8;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
return ret;
@@ -396,7 +396,7 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.size = end - start;
k.k_i.k.version.lo = test_version++;
- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
return ret;
}
@@ -481,7 +481,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = snapid_hi;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
if (ret)
return ret;
@@ -506,7 +506,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
bkey_cookie_init(&cookie.k_i);
cookie.k.p.snapshot = U32_MAX;
- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0);
if (ret)
return ret;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index b1af7ac430f6..0807ce9b171a 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -67,9 +67,14 @@ err:
/* stdio_redirect */
+static bool stdio_redirect_has_more_input(struct stdio_redirect *stdio, size_t seen)
+{
+ return stdio->input.buf.nr > seen || stdio->done;
+}
+
static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
- return stdio->input.buf.nr || stdio->done;
+ return stdio_redirect_has_more_input(stdio, 0);
}
static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
@@ -181,9 +186,13 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
}
spin_lock(&buf->lock);
- if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
- darray_make_room_gfp(&buf->buf,
- min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+ size_t makeroom = b;
+ if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr))
+ makeroom = min_t(ssize_t, makeroom,
+ max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr,
+ 0));
+ darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT);
+
b = min(len, darray_room(buf->buf));
if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
@@ -355,43 +364,67 @@ int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t le
return ret;
}
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio,
+ darray_char *line,
+ unsigned long timeout)
{
+ unsigned long until = jiffies + timeout, t;
struct stdio_buf *buf = &stdio->input;
- size_t copied = 0;
- ssize_t ret = 0;
+ size_t seen = 0;
again:
- do {
- wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
- sysctl_hung_task_timeout_secs * HZ / 2);
- } while (!stdio_redirect_has_input(stdio));
+ t = timeout != MAX_SCHEDULE_TIMEOUT
+ ? max_t(long, until - jiffies, 0)
+ : timeout;
- if (stdio->done) {
- ret = -1;
- goto out;
- }
+ t = min(t, sysctl_hung_task_timeout_secs * HZ / 2);
+
+ wait_event_timeout(buf->wait, stdio_redirect_has_more_input(stdio, seen), t);
+
+ if (stdio->done)
+ return -1;
spin_lock(&buf->lock);
- size_t b = min(len, buf->buf.nr);
- char *n = memchr(buf->buf.data, '\n', b);
- if (n)
- b = min_t(size_t, b, n + 1 - buf->buf.data);
+ seen = buf->buf.nr;
+ char *n = memchr(buf->buf.data, '\n', seen);
+
+ if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
+ spin_unlock(&buf->lock);
+ return -ETIME;
+ }
+
+ if (!n) {
+ buf->waiting_for_line = true;
+ spin_unlock(&buf->lock);
+ goto again;
+ }
+
+ size_t b = n + 1 - buf->buf.data;
+ if (b > line->size) {
+ spin_unlock(&buf->lock);
+ int ret = darray_resize(line, b);
+ if (ret)
+ return ret;
+ seen = 0;
+ goto again;
+ }
+
buf->buf.nr -= b;
- memcpy(ubuf, buf->buf.data, b);
+ memcpy(line->data, buf->buf.data, b);
memmove(buf->buf.data,
buf->buf.data + b,
buf->buf.nr);
- ubuf += b;
- len -= b;
- copied += b;
+ line->nr = b;
+
+ buf->waiting_for_line = false;
spin_unlock(&buf->lock);
wake_up(&buf->wait);
+ return 0;
+}
- if (!n && len)
- goto again;
-out:
- return copied ?: ret;
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, darray_char *line)
+{
+ return bch2_stdio_redirect_readline_timeout(stdio, line, MAX_SCHEDULE_TIMEOUT);
}
__printf(3, 0)
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 1d63d14d7dca..72497b921911 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -71,7 +71,9 @@ int bch2_run_thread_with_stdio(struct thread_with_stdio *,
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
-int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *, darray_char *, unsigned long);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, darray_char *);
__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index e0daf4eec341..f4d484d44f63 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -8,15 +8,12 @@ struct stdio_buf {
spinlock_t lock;
wait_queue_head_t wait;
darray_char buf;
+ bool waiting_for_line;
};
struct stdio_redirect {
struct stdio_buf input;
struct stdio_buf output;
-
- spinlock_t input_lock;
- wait_queue_head_t input_wait;
- darray_char input_buf;
bool done;
};
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 84fcf26e306e..d0e6b9deb6cb 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -200,6 +200,56 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* fs.c: */
+TRACE_EVENT(bch2_sync_fs,
+ TP_PROTO(struct super_block *sb, int wait),
+
+ TP_ARGS(sb, wait),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( int, wait )
+
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->wait = wait;
+ ),
+
+ TP_printk("dev %d,%d wait %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->wait)
+);
+
+/* fs-io.c: */
+TRACE_EVENT(bch2_fsync,
+ TP_PROTO(struct file *file, int datasync),
+
+ TP_ARGS(file, datasync),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ino_t, parent )
+ __field( int, datasync )
+ ),
+
+ TP_fast_assign(
+ struct dentry *dentry = file->f_path.dentry;
+
+ __entry->dev = dentry->d_sb->s_dev;
+ __entry->ino = d_inode(dentry)->i_ino;
+ __entry->parent = d_inode(dentry->d_parent)->i_ino;
+ __entry->datasync = datasync;
+ ),
+
+ TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ (unsigned long) __entry->parent, __entry->datasync)
+);
+
/* super-io.c: */
TRACE_EVENT(write_super,
TP_PROTO(struct bch_fs *c, unsigned long ip),
diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h
index 905801772002..7f647846b511 100644
--- a/fs/bcachefs/two_state_shared_lock.h
+++ b/fs/bcachefs/two_state_shared_lock.h
@@ -36,15 +36,14 @@ static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s)
static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s)
{
long i = s ? 1 : -1;
- long v = atomic_long_read(&lock->v), old;
+ long old;
+ old = atomic_long_read(&lock->v);
do {
- old = v;
-
- if (i > 0 ? v < 0 : v > 0)
+ if (i > 0 ? old < 0 : old > 0)
return false;
- } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
- old, old + i)) != old);
+ } while (!atomic_long_try_cmpxchg_acquire(&lock->v, &old, old + i));
+
return true;
}
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 65d42206703c..138320eaa2ad 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -252,8 +252,10 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
+static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
+ bool nonblocking)
{
+ bool locked = false;
const char *p;
if (!lines) {
@@ -261,7 +263,13 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
return;
}
- console_lock();
+ if (!nonblocking) {
+ console_lock();
+ locked = true;
+ } else {
+ locked = console_trylock();
+ }
+
while (1) {
p = strchrnul(lines, '\n');
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
@@ -269,7 +277,18 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
break;
lines = p + 1;
}
- console_unlock();
+ if (locked)
+ console_unlock();
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, false);
+}
+
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
+{
+ return __bch2_print_string_as_lines(prefix, lines, true);
}
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index d0d99400f986..902b7f5406a2 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -201,6 +201,7 @@ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
+void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
@@ -583,14 +584,19 @@ do { \
} \
} while (0)
+#define per_cpu_sum(_p) \
+({ \
+ typeof(*_p) _ret = 0; \
+ \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ _ret += *per_cpu_ptr(_p, cpu); \
+ _ret; \
+})
+
static inline u64 percpu_u64_get(u64 __percpu *src)
{
- u64 ret = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- ret += *per_cpu_ptr(src, cpu);
- return ret;
+ return per_cpu_sum(src);
}
static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
@@ -604,9 +610,7 @@ static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
{
- unsigned i;
-
- for (i = 0; i < nr; i++)
+ for (unsigned i = 0; i < nr; i++)
acc[i] += src[i];
}
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
index cb4f33ed9ab3..a9ebcd82c602 100644
--- a/fs/bcachefs/varint.c
+++ b/fs/bcachefs/varint.c
@@ -85,7 +85,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
if (likely(bytes < 9)) {
v <<= bytes;
- v |= ~(~0 << (bytes - 1));
+ v |= ~(~0U << (bytes - 1));
} else {
*out++ = 255;
bytes = 9;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index d76f406d3b2e..f92f108840f5 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -475,6 +475,7 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
char *link = folio_address(folio);
+ int err = -EIO;
if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
@@ -487,13 +488,10 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio)
goto fail;
}
link[len - 1] = '\0';
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
+ err = 0;
fail:
- folio_set_error(folio);
- folio_unlock(folio);
- return -EIO;
+ folio_end_read(folio, err == 0);
+ return err;
}
/*
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a43897b03ce9..5ae8045f4df4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1003,7 +1003,8 @@ out_free_interp:
if (elf_read_implies_exec(*elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space);
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space)
current->flags |= PF_RANDOMIZE;
setup_new_exec(bprm);
@@ -1061,10 +1062,40 @@ out_free_interp:
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
* Program Headers.
+ */
+
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to load_addr_set which is set to true later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
*
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
+ total_size = total_mapping_size(elf_phdata,
+ elf_ex->e_phnum);
+ if (!total_size) {
+ retval = -EINVAL;
+ goto out_free_dentry;
+ }
+
+ /* Calculate any requested alignment. */
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+
+ /*
* There are effectively two types of ET_DYN
- * binaries: programs (i.e. PIE: ET_DYN with INTERP)
- * and loaders (ET_DYN without INTERP, since they
+ * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP)
+ * and loaders (ET_DYN without PT_INTERP, since they
* _are_ the ELF interpreter). The loaders must
* be loaded away from programs since the program
* may otherwise collide with the loader (especially
@@ -1084,15 +1115,44 @@ out_free_interp:
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
+ /* On ET_DYN with PT_INTERP, we do the ASLR. */
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
- alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ /* Adjust alignment as requested. */
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
- } else
- load_bias = 0;
+ } else {
+ /*
+ * For ET_DYN without PT_INTERP, we rely on
+ * the architectures's (potentially ASLR) mmap
+ * base address (via a load_bias of 0).
+ *
+ * When a large alignment is requested, we
+ * must do the allocation at address "0" right
+ * now to discover where things will load so
+ * that we can adjust the resulting alignment.
+ * In this case (load_bias != 0), we can use
+ * MAP_FIXED_NOREPLACE to make sure the mapping
+ * doesn't collide with anything.
+ */
+ if (alignment > ELF_MIN_ALIGN) {
+ load_bias = elf_load(bprm->file, 0, elf_ppnt,
+ elf_prot, elf_flags, total_size);
+ if (BAD_ADDR(load_bias)) {
+ retval = IS_ERR_VALUE(load_bias) ?
+ PTR_ERR((void*)load_bias) : -EINVAL;
+ goto out_free_dentry;
+ }
+ vm_munmap(load_bias, total_size);
+ /* Adjust alignment as requested. */
+ if (alignment)
+ load_bias &= ~(alignment - 1);
+ elf_flags |= MAP_FIXED_NOREPLACE;
+ } else
+ load_bias = 0;
+ }
/*
* Since load_bias is used for all subsequent loading
@@ -1102,31 +1162,6 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
-
- /*
- * Calculate the entire size of the ELF mapping
- * (total_size), used for the initial mapping,
- * due to load_addr_set which is set to true later
- * once the initial mapping is performed.
- *
- * Note that this is only sensible when the LOAD
- * segments are contiguous (or overlapping). If
- * used for LOADs that are far apart, this would
- * cause the holes between LOADs to be mapped,
- * running the risk of having the mapping fail,
- * as it would be larger than the ELF file itself.
- *
- * As a result, only ET_DYN does this, since
- * some ET_EXEC (e.g. ia64) may have large virtual
- * memory holes between LOADs.
- *
- */
- total_size = total_mapping_size(elf_phdata,
- elf_ex->e_phnum);
- if (!total_size) {
- retval = -EINVAL;
- goto out_free_dentry;
- }
}
error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
@@ -1216,7 +1251,6 @@ out_free_interp:
}
reloc_func_desc = interp_load_addr;
- allow_write_access(interpreter);
fput(interpreter);
kfree(interp_elf_ex);
@@ -1251,7 +1285,7 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
- if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+ if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) {
/*
* For architectures with ELF randomization, when executing
* a loader directly (i.e. no interpreter listed in ELF
@@ -1308,7 +1342,6 @@ out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
out_free_file:
- allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index b799701454a9..28a3439f163a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -394,7 +394,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
goto error;
}
- allow_write_access(interpreter);
fput(interpreter);
interpreter = NULL;
}
@@ -466,10 +465,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = 0;
error:
- if (interpreter) {
- allow_write_access(interpreter);
+ if (interpreter)
fput(interpreter);
- }
kfree(interpreter_name);
kfree(exec_params.phdrs);
kfree(exec_params.loadmap);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 68fa225f89e5..31660d8cc2c6 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -247,13 +247,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
if (retval < 0)
goto ret;
- if (fmt->flags & MISC_FMT_OPEN_FILE) {
+ if (fmt->flags & MISC_FMT_OPEN_FILE)
interp_file = file_clone_open(fmt->interp_file);
- if (!IS_ERR(interp_file))
- deny_write_access(interp_file);
- } else {
+ else
interp_file = open_exec(fmt->interpreter);
- }
retval = PTR_ERR(interp_file);
if (IS_ERR(interp_file))
goto ret;
@@ -1086,4 +1083,5 @@ static void __exit exit_misc_binfmt(void)
core_initcall(init_misc_binfmt);
module_exit(exit_misc_binfmt);
+MODULE_DESCRIPTION("Kernel support for miscellaneous binaries");
MODULE_LICENSE("GPL");
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 1b6625e95958..637daf6e4d45 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -155,4 +155,5 @@ static void __exit exit_script_binfmt(void)
core_initcall(init_script_binfmt);
module_exit(exit_script_binfmt);
+MODULE_DESCRIPTION("Kernel support for scripts starting with #!");
MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 525af975f61c..87617f2968bc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
- lru_cache.o raid-stripe-tree.o
+ lru_cache.o raid-stripe-tree.o fiemap.o direct-io.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 6fce3e8d3dac..b2eb9cde2c5d 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -34,7 +34,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
static inline u8 get_unaligned_le8(const void *p)
{
- return *(u8 *)p;
+ return *(const u8 *)p;
}
static inline void put_unaligned_le8(u8 val, void *p)
@@ -48,8 +48,8 @@ static inline void put_unaligned_le8(u8 val, void *p)
offsetof(type, member), \
sizeof_field(type, member)))
-#define write_eb_member(eb, ptr, type, member, result) (\
- write_extent_buffer(eb, (char *)(result), \
+#define write_eb_member(eb, ptr, type, member, source) ( \
+ write_extent_buffer(eb, (const char *)(source), \
((unsigned long)(ptr)) + \
offsetof(type, member), \
sizeof_field(type, member)))
@@ -315,11 +315,8 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
- struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64);
@@ -353,7 +350,7 @@ static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
struct btrfs_tree_block_info *item,
- struct btrfs_disk_key *key)
+ const struct btrfs_disk_key *key)
{
write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}
@@ -446,7 +443,7 @@ void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr);
static inline void btrfs_set_node_key(const struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr;
@@ -512,7 +509,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb,
}
static inline void btrfs_set_item_key(struct extent_buffer *eb,
- struct btrfs_disk_key *disk_key, int nr)
+ const struct btrfs_disk_key *disk_key, int nr)
{
struct btrfs_item *item = btrfs_item_nr(eb, nr);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index e3a57196b0ee..f04d93109960 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -29,7 +29,7 @@ struct btrfs_failed_bio {
/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
- return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
+ return bbio->inode && is_data_inode(bbio->inode);
}
static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
@@ -732,7 +732,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
* point, so they are handled as part of the no-checksum case.
*/
if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
- !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+ !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
!btrfs_is_data_reloc_root(inode->root)) {
if (should_async_write(bbio) &&
btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1a66be33bb04..498442d0c216 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1022,6 +1022,13 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
+static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
+ return fs_info->block_group_root;
+ return btrfs_extent_root(fs_info, 0);
+}
+
static int remove_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *block_group)
@@ -1757,24 +1764,21 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
- const struct btrfs_space_info *space_info = bg->space_info;
- const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
+ u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
- u64 thresh;
- if (reclaim_thresh == 0)
+ if (thresh_bytes == 0)
return false;
- thresh = mult_perc(bg->length, reclaim_thresh);
-
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
- if (old_val < thresh)
+ if (old_val < thresh_bytes)
return false;
- if (new_val >= thresh)
+ if (new_val >= thresh_bytes)
return false;
return true;
}
@@ -1822,6 +1826,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
+ u64 reclaimed;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1835,6 +1840,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+ spin_lock(&space_info->lock);
spin_lock(&bg->lock);
if (bg->reserved || bg->pinned || bg->ro) {
/*
@@ -1844,6 +1850,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
* this block group.
*/
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
@@ -1862,6 +1869,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_mark_bg_unused(bg);
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
@@ -1878,10 +1886,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!should_reclaim_block_group(bg, bg->length)) {
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
/*
* Get out fast, in case we're read-only or unmounting the
@@ -1914,18 +1924,38 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
div64_u64(bg->used * 100, bg->length),
div64_u64(zone_unusable * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
+ reclaimed = bg->used;
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
+ reclaimed = 0;
+ spin_lock(&space_info->lock);
+ space_info->reclaim_errors++;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ space_info->periodic_reclaim_ready = false;
+ spin_unlock(&space_info->lock);
}
+ spin_lock(&space_info->lock);
+ space_info->reclaim_count++;
+ space_info->reclaim_bytes += reclaimed;
+ spin_unlock(&space_info->lock);
next:
- if (ret) {
+ if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
/* Refcount held by the reclaim_bgs list after splice. */
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * This block group might be added to the unused list
+ * during the above process. Move it back to the
+ * reclaim list otherwise.
+ */
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, &retry_list);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
}
btrfs_put_block_group(bg);
@@ -1955,6 +1985,7 @@ end:
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
+ btrfs_reclaim_sweep(fs_info);
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
@@ -3653,9 +3684,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
cache->used = old_val;
cache->reserved -= num_bytes;
+ cache->reclaim_mark = 0;
space_info->bytes_reserved -= num_bytes;
space_info->bytes_used += num_bytes;
space_info->disk_used += num_bytes * factor;
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, -num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
} else {
@@ -3665,8 +3699,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
space_info->bytes_used -= num_bytes;
space_info->disk_used -= num_bytes * factor;
-
- reclaim = should_reclaim_block_group(cache, num_bytes);
+ if (READ_ONCE(space_info->periodic_reclaim))
+ btrfs_space_info_update_reclaimable(space_info, num_bytes);
+ else
+ reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -4320,13 +4356,13 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
spin_lock(&block_group->lock);
if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
&block_group->runtime_flags)) {
- struct inode *inode = block_group->inode;
+ struct btrfs_inode *inode = block_group->inode;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
- iput(inode);
+ iput(&inode->vfs_inode);
} else {
spin_unlock(&block_group->lock);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 85e2d4cd12dc..915111338fc0 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -115,7 +115,7 @@ struct btrfs_caching_control {
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
spinlock_t lock;
u64 start;
u64 length;
@@ -263,6 +263,7 @@ struct btrfs_block_group {
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
enum btrfs_block_group_size_class size_class;
+ u64 reclaim_mark;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ed495ca7a31..3056c8aed8ef 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,7 +19,6 @@
#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
#include "block-rsv.h"
-#include "btrfs_inode.h"
#include "extent_map.h"
#include "extent_io.h"
#include "extent-io-tree.h"
@@ -99,6 +98,29 @@ enum {
* range).
*/
BTRFS_INODE_COW_WRITE_ERROR,
+ /*
+ * Indicate this is a directory that points to a subvolume for which
+ * there is no root reference item. That's a case like the following:
+ *
+ * $ btrfs subvolume create /mnt/parent
+ * $ btrfs subvolume create /mnt/parent/child
+ * $ btrfs subvolume snapshot /mnt/parent /mnt/snap
+ *
+ * If subvolume "parent" is root 256, subvolume "child" is root 257 and
+ * snapshot "snap" is root 258, then there's no root reference item (key
+ * BTRFS_ROOT_REF_KEY in the root tree) for the subvolume "child"
+ * associated to root 258 (the snapshot) - there's only for the root
+ * of the "parent" subvolume (root 256). In the chunk root we have a
+ * (256 BTRFS_ROOT_REF_KEY 257) key but we don't have a
+ * (258 BTRFS_ROOT_REF_KEY 257) key - the sames goes for backrefs, we
+ * have a (257 BTRFS_ROOT_BACKREF_KEY 256) but we don't have a
+ * (257 BTRFS_ROOT_BACKREF_KEY 258) key.
+ *
+ * So when opening the "child" dentry from the snapshot's directory,
+ * we don't find a root ref item and we create a stub inode. This is
+ * done at new_simple_dir(), called from btrfs_lookup_dentry().
+ */
+ BTRFS_INODE_ROOT_STUB,
};
/* in memory btrfs inode */
@@ -106,10 +128,14 @@ struct btrfs_inode {
/* which subvolume this inode belongs to */
struct btrfs_root *root;
- /* key used to find this inode on disk. This is used by the code
- * to read in roots of subvolumes
+#if BITS_PER_LONG == 32
+ /*
+ * The objectid of the corresponding BTRFS_INODE_ITEM_KEY.
+ * On 64 bits platforms we can get it from vfs_inode.i_ino, which is an
+ * unsigned long and therefore 64 bits on such platforms.
*/
- struct btrfs_key location;
+ u64 objectid;
+#endif
/* Cached value of inode property 'compression'. */
u8 prop_compress;
@@ -165,9 +191,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
- /* node for the red-black tree that links inodes in subvolume root */
- struct rb_node rb_node;
-
unsigned long runtime_flags;
/* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -228,11 +251,20 @@ struct btrfs_inode {
u64 last_dir_index_offset;
};
- /*
- * Total number of bytes pending defrag, used by stat to check whether
- * it needs COW. Protected by 'lock'.
- */
- u64 defrag_bytes;
+ union {
+ /*
+ * Total number of bytes pending defrag, used by stat to check whether
+ * it needs COW. Protected by 'lock'.
+ * Used by inodes other than the data relocation inode.
+ */
+ u64 defrag_bytes;
+
+ /*
+ * Logical address of the block group being relocated.
+ * Used only by the data relocation inode.
+ */
+ u64 reloc_block_group_start;
+ };
/*
* The size of the file stored in the metadata on disk. data=ordered
@@ -241,12 +273,21 @@ struct btrfs_inode {
*/
u64 disk_i_size;
- /*
- * If this is a directory then index_cnt is the counter for the index
- * number for new files that are created. For an empty directory, this
- * must be initialized to BTRFS_DIR_START_INDEX.
- */
- u64 index_cnt;
+ union {
+ /*
+ * If this is a directory then index_cnt is the counter for the
+ * index number for new files that are created. For an empty
+ * directory, this must be initialized to BTRFS_DIR_START_INDEX.
+ */
+ u64 index_cnt;
+
+ /*
+ * If this is not a directory, this is the number of bytes
+ * outstanding that are going to need csums. This is used in
+ * ENOSPC accounting. Protected by 'lock'.
+ */
+ u64 csum_bytes;
+ };
/* Cache the directory index number to speed the dir/file remove */
u64 dir_index;
@@ -258,22 +299,25 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
- /*
- * The id/generation of the last transaction where this inode was
- * either the source or the destination of a clone/dedupe operation.
- * Used when logging an inode to know if there are shared extents that
- * need special care when logging checksum items, to avoid duplicate
- * checksum items in a log (which can lead to a corruption where we end
- * up with missing checksum ranges after log replay).
- * Protected by the vfs inode lock.
- */
- u64 last_reflink_trans;
+ union {
+ /*
+ * The id/generation of the last transaction where this inode
+ * was either the source or the destination of a clone/dedupe
+ * operation. Used when logging an inode to know if there are
+ * shared extents that need special care when logging checksum
+ * items, to avoid duplicate checksum items in a log (which can
+ * lead to a corruption where we end up with missing checksum
+ * ranges after log replay). Protected by the VFS inode lock.
+ * Used for regular files only.
+ */
+ u64 last_reflink_trans;
- /*
- * Number of bytes outstanding that are going to need csums. This is
- * used in ENOSPC accounting. Protected by 'lock'.
- */
- u64 csum_bytes;
+ /*
+ * In case this a root stub inode (BTRFS_INODE_ROOT_STUB flag set),
+ * the ID of that root.
+ */
+ u64 ref_root_id;
+ };
/* Backwards incompatible flags, lower half of inode_item::flags */
u32 flags;
@@ -331,10 +375,9 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
*/
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
- u64 ino = inode->location.objectid;
+ u64 ino = inode->objectid;
- /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
- if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ if (test_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags))
ino = inode->vfs_inode.i_ino;
return ino;
}
@@ -348,20 +391,36 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode)
#endif
+static inline void btrfs_get_inode_key(const struct btrfs_inode *inode,
+ struct btrfs_key *key)
+{
+ key->objectid = btrfs_ino(inode);
+ key->type = BTRFS_INODE_ITEM_KEY;
+ key->offset = 0;
+}
+
+static inline void btrfs_set_inode_number(struct btrfs_inode *inode, u64 ino)
+{
+#if BITS_PER_LONG == 32
+ inode->objectid = ino;
+#endif
+ inode->vfs_inode.i_ino = ino;
+}
+
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
i_size_write(&inode->vfs_inode, size);
inode->disk_i_size = size;
}
-static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
+static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode)
{
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
-static inline bool is_data_inode(struct inode *inode)
+static inline bool is_data_inode(const struct btrfs_inode *inode)
{
- return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
+ return btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID;
}
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
@@ -455,8 +514,8 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict);
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict);
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
@@ -515,9 +574,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path);
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
@@ -551,10 +610,6 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before);
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);
extern const struct dentry_operations btrfs_dentry_operations;
@@ -571,5 +626,10 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes);
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type);
#endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6441e47d8a5e..a8e2c461aff7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -261,7 +261,7 @@ void btrfs_free_compr_folio(struct folio *folio)
folio_put(folio);
}
-static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
+static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -334,7 +334,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
+static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -374,7 +374,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
blk_opf_t write_flags,
bool writeback)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct compressed_bio *cb;
@@ -383,7 +383,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_bbio_comprssed_write);
+ end_bbio_compressed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
cb->compressed_folios = compressed_folios;
@@ -507,13 +507,15 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
if (!em || cur < em->start ||
(cur + fs_info->sectorsize > extent_map_end(em)) ||
- (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) {
+ (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
}
+ add_size = min(em->start + em->len, page_end + 1) - cur;
free_extent_map(em);
if (page->index == end_index) {
@@ -526,7 +528,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
}
- add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end, NULL);
@@ -585,12 +586,12 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
}
ASSERT(extent_map_is_compressed(em));
- compressed_len = em->block_len;
+ compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_bbio_comprssed_read);
+ end_bbio_compressed_read);
- cb->start = em->orig_start;
+ cb->start = em->start - em->offset;
em_len = em->len;
em_start = em->start;
@@ -608,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out_free_bio;
}
- ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0);
+ ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -1506,7 +1507,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
*
* Return non-zero if the compression should be done, 0 otherwise.
*/
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
{
struct list_head *ws_list = get_workspace(0, 0);
struct heuristic_ws *ws;
@@ -1516,7 +1517,7 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
ws = list_entry(ws_list, struct heuristic_ws, list);
- heuristic_collect_sample(inode, start, end, ws);
+ heuristic_collect_sample(&inode->vfs_inode, start, end, ws);
if (sample_repeated_patterns(ws)) {
ret = 1;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index c20c1a1b09d5..cfdc64319186 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,7 +144,7 @@ extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
-int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1a49b9232990..451203055bbf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -321,7 +321,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
if (level == 0)
@@ -417,7 +417,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
u64 refs;
u64 owner;
u64 flags;
- u64 new_flags = 0;
int ret;
/*
@@ -462,8 +461,16 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
}
owner = btrfs_header_owner(buf);
- BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
- !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+ if (unlikely(owner == BTRFS_TREE_RELOC_OBJECTID &&
+ !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))) {
+ btrfs_crit(fs_info,
+"found tree block at bytenr %llu level %d root %llu refs %llu flags %llx without full backref flag set",
+ buf->start, btrfs_header_level(buf),
+ btrfs_root_id(root), refs, flags);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (refs > 1) {
if ((owner == btrfs_root_id(root) ||
@@ -481,7 +488,10 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ ret = btrfs_set_disk_extent_flags(trans, buf,
+ BTRFS_BLOCK_FLAG_FULL_BACKREF);
+ if (ret)
+ return ret;
} else {
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
@@ -491,11 +501,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (ret)
return ret;
}
- if (new_flags != 0) {
- ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
- if (ret)
- return ret;
- }
} else {
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
@@ -551,7 +556,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- trans->transid != root->last_trans);
+ trans->transid != btrfs_get_root_last_trans(root));
level = btrfs_header_level(buf);
@@ -588,19 +593,15 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
@@ -612,27 +613,27 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
if (ret < 0) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
atomic_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_unlock_cow;
+ }
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -642,14 +643,16 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
- btrfs_tree_unlock(cow);
- free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
- return ret;
+ goto error_unlock_cow;
}
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
- parent_start, last_ref);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
+ parent_start, last_ref);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto error_unlock_cow;
+ }
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -657,6 +660,11 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(trans, cow);
*cow_ret = cow;
return 0;
+
+error_unlock_cow:
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ return ret;
}
static inline int should_cow_block(struct btrfs_trans_handle *trans,
@@ -983,9 +991,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
/* once for the root ptr */
free_extent_buffer_stale(mid);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
return 0;
}
if (btrfs_header_nritems(mid) >
@@ -1053,10 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), right,
- 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root),
+ right, 0, 1);
free_extent_buffer_stale(right);
right = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1111,9 +1127,13 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto out;
}
root_sub_used_bytes(root);
- btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
free_extent_buffer_stale(mid);
mid = NULL;
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1551,12 +1571,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
- return -EIO;
- }
- if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EUCLEAN;
+ return ret;
}
if (unlock_up)
@@ -2883,7 +2898,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
old = root->node;
ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
if (ret < 0) {
- btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ int ret2;
+
+ ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
btrfs_tree_unlock(c);
free_extent_buffer(c);
return ret;
@@ -4452,9 +4471,12 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
atomic_inc(&leaf->refs);
- btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
- return 0;
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
}
/*
* delete the item at the leaf level in path. If that empties
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c03c58246033..c8568b1a61c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -221,9 +221,11 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t inode_lock;
- /* red-black tree that keeps track of in-memory inodes */
- struct rb_root inode_tree;
+ /*
+ * Xarray that keeps track of in-memory inodes, protected by the lock
+ * @inode_lock.
+ */
+ struct xarray inodes;
/*
* Xarray that keeps track of delayed nodes of every inode, protected
@@ -354,6 +356,16 @@ static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int c
WRITE_ONCE(root->last_log_commit, commit_id);
}
+static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root)
+{
+ return READ_ONCE(root->last_trans);
+}
+
+static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid)
+{
+ WRITE_ONCE(root->last_trans, transid);
+}
+
/*
* Structure that conveys information about an extent that is going to replace
* all the extents in a file range.
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 407ccec3e57e..f6dbda37a361 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -139,7 +139,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
if (trans)
transid = trans->transid;
else
- transid = inode->root->last_trans;
+ transid = btrfs_get_root_last_trans(root);
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
@@ -255,7 +255,7 @@ again:
goto cleanup;
}
- inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
+ inode = btrfs_iget(defrag->ino, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -707,8 +707,10 @@ iterate:
*/
if (key.offset > start) {
em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = 0;
+ em->offset = 0;
em->len = key.offset - start;
break;
}
@@ -825,7 +827,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
*/
next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
goto out;
if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
@@ -992,12 +994,12 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* This is for users who want to convert inline extents to
* regular ones through max_inline= mount option.
*/
- if (em->block_start == EXTENT_MAP_INLINE &&
+ if (em->disk_bytenr == EXTENT_MAP_INLINE &&
em->len <= inode->root->fs_info->max_inline)
goto next;
/* Skip holes and preallocated extents. */
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC))
goto next;
@@ -1062,7 +1064,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* So if an inline extent passed all above checks, just add it
* for defrag, and be converted to regular extents.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
goto add;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index b3527efd0b4b..7aa8a395d838 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -111,7 +111,7 @@
* making error handling and cleanup easier.
*/
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index ce4f889e4f17..3f32953c0a80 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -9,7 +9,7 @@ struct extent_changeset;
struct btrfs_inode;
struct btrfs_fs_info;
-int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len,
bool noflush);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 95a0497fa866..508bdbae29a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -77,14 +77,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return node;
}
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
refcount_inc(&node->refs); /* can be accessed */
BUG_ON(btrfs_inode->delayed_node != node);
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
@@ -111,10 +111,10 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = NULL;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return NULL;
}
@@ -148,21 +148,21 @@ again:
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(-ENOMEM);
}
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
ptr = xa_load(&root->delayed_nodes, ino);
if (ptr) {
/* Somebody inserted it, go back and read it. */
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
kmem_cache_free(delayed_node_cache, node);
node = NULL;
goto again;
}
- ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ptr = __xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
ASSERT(xa_err(ptr) != -EINVAL);
ASSERT(xa_err(ptr) != -ENOMEM);
ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return node;
}
@@ -275,14 +275,12 @@ static void __btrfs_release_delayed_node(
if (refcount_dec_and_test(&delayed_node->refs)) {
struct btrfs_root *root = delayed_node->root;
- spin_lock(&root->inode_lock);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
/*
* Once our refcount goes to zero, nobody is allowed to bump it
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- xa_erase(&root->delayed_nodes, delayed_node->inode_id);
- spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -1471,7 +1469,7 @@ static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1684,7 +1682,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
return 0;
}
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list)
@@ -1692,7 +1690,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_item *item;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return false;
@@ -1700,8 +1698,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- btrfs_inode_lock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1732,7 +1730,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
return true;
}
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list)
{
@@ -1754,10 +1752,10 @@ void btrfs_readdir_put_delayed_items(struct inode *inode,
* The VFS is going to do up_read(), so we need to downgrade back to a
* read lock.
*/
- downgrade_write(&inode->i_rwsem);
+ downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index)
{
struct btrfs_delayed_item *curr;
@@ -1778,7 +1776,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
* Read dir info stored in the delayed tree.
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1916,7 +1914,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -2057,9 +2056,9 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
struct btrfs_delayed_node *node;
int count;
- spin_lock(&root->inode_lock);
+ xa_lock(&root->delayed_nodes);
if (xa_empty(&root->delayed_nodes)) {
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
return;
}
@@ -2076,7 +2075,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
if (count >= ARRAY_SIZE(delayed_nodes))
break;
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->delayed_nodes);
index++;
for (int i = 0; i < count; i++) {
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 64e115d97499..7cfefdfe54ea 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -110,7 +110,7 @@ void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root);
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
- struct btrfs_disk_key *disk_key, u8 flags,
+ const struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -143,17 +143,17 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
-bool btrfs_readdir_get_delayed_items(struct inode *inode,
+bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
u64 last_index,
struct list_head *ins_list,
struct list_head *del_list);
-void btrfs_readdir_put_delayed_items(struct inode *inode,
+void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(struct list_head *del_list,
+int btrfs_should_delete_dir_index(const struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6cc80fb10da2..2ac9296edccb 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -195,48 +195,6 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
}
/*
- * Transfer bytes to our delayed refs rsv.
- *
- * @fs_info: the filesystem
- * @num_bytes: number of bytes to transfer
- *
- * This transfers up to the num_bytes amount, previously reserved, to the
- * delayed_refs_rsv. Any extra bytes are returned to the space info.
- */
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes)
-{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- u64 to_free = 0;
-
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
- u64 delta = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- if (num_bytes > delta) {
- to_free = num_bytes - delta;
- num_bytes = delta;
- }
- } else {
- to_free = num_bytes;
- num_bytes = 0;
- }
-
- if (num_bytes)
- delayed_refs_rsv->reserved += num_bytes;
- if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
- delayed_refs_rsv->full = true;
- spin_unlock(&delayed_refs_rsv->lock);
-
- if (num_bytes)
- trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
- 0, num_bytes, 1);
- if (to_free)
- btrfs_space_info_free_bytes_may_use(fs_info,
- delayed_refs_rsv->space_info, to_free);
-}
-
-/*
* Refill based on our delayed refs usage.
*
* @fs_info: the filesystem
@@ -861,6 +819,12 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
+ /* If not metadata set an impossible level to help debugging. */
+ if (generic_ref->type == BTRFS_REF_METADATA)
+ head_ref->level = generic_ref->tree_ref.level;
+ else
+ head_ref->level = U8_MAX;
+
if (qrecord) {
if (generic_ref->ref_root && reserved) {
qrecord->data_rsv = reserved;
@@ -1114,7 +1078,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
}
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_ref_head *head_ref;
@@ -1124,6 +1088,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
.action = BTRFS_UPDATE_DELAYED_HEAD,
.bytenr = bytenr,
.num_bytes = num_bytes,
+ .tree_ref.level = level,
};
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 04b180ebe1fe..ef15e998be03 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -108,7 +108,6 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
- u8 level;
bool update_key;
bool update_flags;
u64 flags_to_set;
@@ -172,6 +171,9 @@ struct btrfs_delayed_ref_head {
*/
u64 reserved_bytes;
+ /* Tree block level, for metadata only. */
+ u8 level;
+
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
@@ -355,7 +357,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes,
+ u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
@@ -386,8 +388,6 @@ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
-void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
- u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7130040d92ab..f638c458d285 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -684,7 +684,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* Commit dev_replace state and reserve 1 item for it.
@@ -880,7 +880,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
/*
* We have to use this loop approach because at this point src_device
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 9c07d5c3e5ad..001c0c2f872c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -22,7 +22,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
*trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_key *cpu_key,
+ const struct btrfs_key *cpu_key,
u32 data_size,
const char *name,
int name_len)
@@ -108,7 +108,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index)
+ const struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
@@ -379,7 +379,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* for a specific name.
*/
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
@@ -417,7 +417,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di)
+ const struct btrfs_dir_item *di)
{
struct extent_buffer *leaf;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 00b3d83d7569..5f6dfafc91f1 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -17,7 +17,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
- struct btrfs_key *location, u8 type, u64 index);
+ const struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
@@ -33,7 +33,7 @@ struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct btrfs_dir_item *di);
+ const struct btrfs_dir_item *di);
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
@@ -45,7 +45,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
const char *name,
int name_len);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
new file mode 100644
index 000000000000..f9fb2db6a1e4
--- /dev/null
+++ b/fs/btrfs/direct-io.c
@@ -0,0 +1,1052 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/fsverity.h>
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "delalloc-space.h"
+#include "direct-io.h"
+#include "extent-tree.h"
+#include "file.h"
+#include "fs.h"
+#include "transaction.h"
+#include "volumes.h"
+
+struct btrfs_dio_data {
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ struct btrfs_ordered_extent *ordered;
+ bool data_space_reserved;
+ bool nocow_done;
+};
+
+struct btrfs_dio_private {
+ /* Range of I/O */
+ u64 file_offset;
+ u32 bytes;
+
+ /* This must be last */
+ struct btrfs_bio bbio;
+};
+
+static struct bio_set btrfs_dio_bioset;
+
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+ struct extent_state **cached_state,
+ unsigned int iomap_flags)
+{
+ const bool writing = (iomap_flags & IOMAP_WRITE);
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_ordered_extent *ordered;
+ int ret = 0;
+
+ while (1) {
+ if (nowait) {
+ if (!try_lock_extent(io_tree, lockstart, lockend,
+ cached_state))
+ return -EAGAIN;
+ } else {
+ lock_extent(io_tree, lockstart, lockend, cached_state);
+ }
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure there's no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
+ lockend - lockstart + 1);
+
+ /*
+ * We need to make sure there are no buffered pages in this
+ * range either, we could have raced between the invalidate in
+ * generic_file_direct_write and locking the extent. The
+ * invalidate needs to happen so that reads after a write do not
+ * get stale data.
+ */
+ if (!ordered &&
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
+ break;
+
+ unlock_extent(io_tree, lockstart, lockend, cached_state);
+
+ if (ordered) {
+ if (nowait) {
+ btrfs_put_ordered_extent(ordered);
+ ret = -EAGAIN;
+ break;
+ }
+ /*
+ * If we are doing a DIO read and the ordered extent we
+ * found is for a buffered write, we can not wait for it
+ * to complete and retry, because if we do so we can
+ * deadlock with concurrent buffered writes on page
+ * locks. This happens only if our DIO read covers more
+ * than one extent map, if at this point has already
+ * created an ordered extent for a previous extent map
+ * and locked its range in the inode's io tree, and a
+ * concurrent write against that previous extent map's
+ * range and this range started (we unlock the ranges
+ * in the io tree only when the bios complete and
+ * buffered writes always lock pages before attempting
+ * to lock range in the io tree).
+ */
+ if (writing ||
+ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+ btrfs_start_ordered_extent(ordered);
+ else
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ /*
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readahead (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readahead wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
+ */
+ ret = nowait ? -EAGAIN : -ENOTBLK;
+ }
+
+ if (ret)
+ break;
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 start,
+ const struct btrfs_file_extent *file_extent,
+ const int type)
+{
+ struct extent_map *em = NULL;
+ struct btrfs_ordered_extent *ordered;
+
+ if (type != BTRFS_ORDERED_NOCOW) {
+ em = btrfs_create_io_em(inode, start, file_extent, type);
+ if (IS_ERR(em))
+ goto out;
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT));
+ if (IS_ERR(ordered)) {
+ if (em) {
+ free_extent_map(em);
+ btrfs_drop_extent_map_range(inode, start,
+ start + file_extent->num_bytes - 1, false);
+ }
+ em = ERR_CAST(ordered);
+ } else {
+ ASSERT(!dio_data->ordered);
+ dio_data->ordered = ordered;
+ }
+ out:
+
+ return em;
+}
+
+static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
+again:
+ ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+ 0, alloc_hint, &ins, 1, 1);
+ if (ret == -EAGAIN) {
+ ASSERT(btrfs_is_zoned(fs_info));
+ wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
+ TASK_UNINTERRUPTIBLE);
+ goto again;
+ }
+ if (ret)
+ return ERR_PTR(ret);
+
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
+ em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ if (IS_ERR(em))
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
+ 1);
+
+ return em;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+ struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ u64 start, u64 *lenp,
+ unsigned int iomap_flags)
+{
+ const bool nowait = (iomap_flags & IOMAP_NOWAIT);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_file_extent file_extent;
+ struct extent_map *em = *map;
+ int type;
+ u64 block_start;
+ struct btrfs_block_group *bg;
+ bool can_nocow = false;
+ bool space_reserved = false;
+ u64 len = *lenp;
+ u64 prev_len;
+ int ret = 0;
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->disk_bytenr != EXTENT_MAP_HOLE)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC)
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = extent_map_block_start(em) + (start - em->start);
+
+ if (can_nocow_extent(inode, start, &len,
+ &file_extent, false, false) == 1) {
+ bg = btrfs_inc_nocow_writers(fs_info, block_start);
+ if (bg)
+ can_nocow = true;
+ }
+ }
+
+ prev_len = len;
+ if (can_nocow) {
+ struct extent_map *em2;
+
+ /* We can NOCOW, so only need to reserve metadata space. */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ nowait);
+ if (ret < 0) {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+ btrfs_dec_nocow_writers(bg);
+ if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
+ ret = -EAGAIN;
+ goto out;
+ }
+ space_reserved = true;
+
+ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
+ &file_extent, type);
+ btrfs_dec_nocow_writers(bg);
+ if (type == BTRFS_ORDERED_PREALLOC) {
+ free_extent_map(em);
+ *map = em2;
+ em = em2;
+ }
+
+ if (IS_ERR(em2)) {
+ ret = PTR_ERR(em2);
+ goto out;
+ }
+
+ dio_data->nocow_done = true;
+ } else {
+ /* Our caller expects us to free the input extent map. */
+ free_extent_map(em);
+ *map = NULL;
+
+ if (nowait) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * If we could not allocate data space before locking the file
+ * range and we can't do a NOCOW write, then we have to fail.
+ */
+ if (!dio_data->data_space_reserved) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * We have to COW and we have already reserved data space before,
+ * so now we reserve only metadata.
+ */
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
+ false);
+ if (ret < 0)
+ goto out;
+ space_reserved = true;
+
+ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ *map = em;
+ len = min(len, em->len - (start - em->start));
+ if (len < prev_len)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ prev_len - len, true);
+ }
+
+ /*
+ * We have created our ordered extent, so we can now release our reservation
+ * for an outstanding extent.
+ */
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
+
+ /*
+ * Need to update the i_size under the extent lock so buffered
+ * readers will get the updated i_size when we unlock.
+ */
+ if (start + len > i_size_read(inode))
+ i_size_write(inode, start + len);
+out:
+ if (ret && space_reserved) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
+ }
+ *lenp = len;
+ return ret;
+}
+
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct extent_map *em;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = iter->private;
+ u64 lockstart, lockend;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+ u64 len = length;
+ const u64 data_alloc_len = length;
+ bool unlock_extents = false;
+
+ /*
+ * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+ * we're NOWAIT we may submit a bio for a partial range and return
+ * EIOCBQUEUED, which would result in an errant short read.
+ *
+ * The best way to handle this would be to allow for partial completions
+ * of iocb's, so we could submit the partial bio, return and fault in
+ * the rest of the pages, and then submit the io for the rest of the
+ * range. However we don't have that currently, so simply return
+ * -EAGAIN at this point so that the normal path is used.
+ */
+ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+ return -EAGAIN;
+
+ /*
+ * Cap the size of reads to that usually seen in buffered I/O as we need
+ * to allocate a contiguous array for the checksums.
+ */
+ if (!write)
+ len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
+
+ lockstart = start;
+ lockend = start + len - 1;
+
+ /*
+ * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
+ * enough if we've written compressed pages to this area, so we need to
+ * flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk - the first flush only starts
+ * compression on the data, while keeping the pages locked, so by the
+ * time the second flush returns we know bios for the compressed pages
+ * were submitted and finished, and the pages no longer under writeback.
+ *
+ * If we have a NOWAIT request and we have any pages in the range that
+ * are locked, likely due to compression still in progress, we don't want
+ * to block on page locks. We also don't want to block on pages marked as
+ * dirty or under writeback (same as for the non-compression case).
+ * iomap_dio_rw() did the same check, but after that and before we got
+ * here, mmap'ed writes may have happened or buffered reads started
+ * (readpage() and readahead(), which lock pages), as we haven't locked
+ * the file range yet.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (flags & IOMAP_NOWAIT) {
+ if (filemap_range_needs_writeback(inode->i_mapping,
+ lockstart, lockend))
+ return -EAGAIN;
+ } else {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+ }
+
+ memset(dio_data, 0, sizeof(*dio_data));
+
+ /*
+ * We always try to allocate data space and must do it before locking
+ * the file range, to avoid deadlocks with concurrent writes to the same
+ * range if the range has several extents and the writes don't expand the
+ * current i_size (the inode lock is taken in shared mode). If we fail to
+ * allocate data space here we continue and later, after locking the
+ * file range, we fail with ENOSPC only if we figure out we can not do a
+ * NOCOW write.
+ */
+ if (write && !(flags & IOMAP_NOWAIT)) {
+ ret = btrfs_check_data_free_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, data_alloc_len, false);
+ if (!ret)
+ dio_data->data_space_reserved = true;
+ else if (ret && !(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ goto err;
+ }
+
+ /*
+ * If this errors out it's because we couldn't invalidate pagecache for
+ * this range and we need to fallback to buffered IO, or we are doing a
+ * NOWAIT read/write and we need to block.
+ */
+ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
+ if (ret < 0)
+ goto err;
+
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto unlock_err;
+ }
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because that's what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
+ goto unlock_err;
+ }
+
+ len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data then what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, &len, flags);
+ if (ret < 0)
+ goto unlock_err;
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
+ if (dio_data->data_space_reserved) {
+ u64 release_offset;
+ u64 release_len = 0;
+
+ if (dio_data->nocow_done) {
+ release_offset = start;
+ release_len = data_alloc_len;
+ } else if (len < data_alloc_len) {
+ release_offset = start + len;
+ release_len = data_alloc_len - len;
+ }
+
+ if (release_len > 0)
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ release_offset,
+ release_len);
+ }
+ } else {
+ /*
+ * We need to unlock only the end area that we aren't using.
+ * The rest is going to be unlocked by the endio routine.
+ */
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
+ }
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
+ iomap->length = len;
+ free_extent_map(em);
+
+ return 0;
+
+unlock_err:
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
+err:
+ if (dio_data->data_space_reserved) {
+ btrfs_free_reserved_data_space(BTRFS_I(inode),
+ dio_data->data_reserved,
+ start, data_alloc_len);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+ struct btrfs_dio_data *dio_data = iter->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+ int ret = 0;
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+ NULL);
+ return 0;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ pos, length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
+ ret = -ENOTBLK;
+ }
+ if (write) {
+ btrfs_put_ordered_extent(dio_data->ordered);
+ dio_data->ordered = NULL;
+ }
+
+ if (write)
+ extent_changeset_free(dio_data->data_reserved);
+ return ret;
+}
+
+static void btrfs_dio_end_io(struct btrfs_bio *bbio)
+{
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_inode *inode = bbio->inode;
+ struct bio *bio = &bbio->bio;
+
+ if (bio->bi_status) {
+ btrfs_warn(inode->root->fs_info,
+ "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
+ btrfs_ino(inode), bio->bi_opf,
+ dip->file_offset, dip->bytes, bio->bi_status);
+ }
+
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+ btrfs_finish_ordered_extent(bbio->ordered, NULL,
+ dip->file_offset, dip->bytes,
+ !bio->bi_status);
+ } else {
+ unlock_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
+ }
+
+ bbio->bio.bi_private = bbio->private;
+ iomap_dio_bio_end_io(bio);
+}
+
+static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
+ struct btrfs_ordered_extent *ordered)
+{
+ u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 len = bbio->bio.bi_iter.bi_size;
+ struct btrfs_ordered_extent *new;
+ int ret;
+
+ /* Must always be called for the beginning of an ordered extent. */
+ if (WARN_ON_ONCE(start != ordered->disk_bytenr))
+ return -EINVAL;
+
+ /* No need to split if the ordered extent covers the entire bio. */
+ if (ordered->disk_num_bytes == len) {
+ refcount_inc(&ordered->refs);
+ bbio->ordered = ordered;
+ return 0;
+ }
+
+ /*
+ * Don't split the extent_map for NOCOW extents, as we're writing into
+ * a pre-existing one.
+ */
+ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
+ if (ret)
+ return ret;
+ }
+
+ new = btrfs_split_ordered_extent(ordered, len);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ bbio->ordered = new;
+ return 0;
+}
+
+static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
+ loff_t file_offset)
+{
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_dio_private *dip =
+ container_of(bbio, struct btrfs_dio_private, bbio);
+ struct btrfs_dio_data *dio_data = iter->private;
+
+ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
+ btrfs_dio_end_io, bio->bi_private);
+ bbio->inode = BTRFS_I(iter->inode);
+ bbio->file_offset = file_offset;
+
+ dip->file_offset = file_offset;
+ dip->bytes = bio->bi_iter.bi_size;
+
+ dio_data->submitted += bio->bi_iter.bi_size;
+
+ /*
+ * Check if we are doing a partial write. If we are, we need to split
+ * the ordered extent to match the submitted bio. Hang on to the
+ * remaining unfinishable ordered_extent in dio_data so that it can be
+ * cancelled in iomap_end to avoid a deadlock wherein faulting the
+ * remaining pages is blocked on the outstanding ordered extent.
+ */
+ if (iter->flags & IOMAP_WRITE) {
+ int ret;
+
+ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
+ if (ret) {
+ btrfs_finish_ordered_extent(dio_data->ordered, NULL,
+ file_offset, dip->bytes,
+ !ret);
+ bio->bi_status = errno_to_blk_status(ret);
+ iomap_dio_bio_end_io(bio);
+ return;
+ }
+ }
+
+ btrfs_submit_bio(bbio, 0);
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_dio_submit_io,
+ .bio_set = &btrfs_dio_bioset,
+};
+
+static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
+ size_t done_before)
+{
+ struct btrfs_dio_data data = { 0 };
+
+ return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ IOMAP_DIO_PARTIAL, &data, done_before);
+}
+
+static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ const u32 blocksize_mask = fs_info->sectorsize - 1;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ loff_t pos;
+ ssize_t written = 0;
+ ssize_t written_buffered;
+ size_t prev_left = 0;
+ loff_t endbyte;
+ ssize_t ret;
+ unsigned int ilock_flags = 0;
+ struct iomap_dio *dio;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ ilock_flags |= BTRFS_ILOCK_TRY;
+
+ /*
+ * If the write DIO is within EOF, use a shared lock and also only if
+ * security bits will likely not be dropped by file_remove_privs() called
+ * from btrfs_write_check(). Either will need to be rechecked after the
+ * lock was acquired.
+ */
+ if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
+ ilock_flags |= BTRFS_ILOCK_SHARED;
+
+relock:
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
+
+ /* Shared lock cannot be used with security bits set. */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ return ret;
+ }
+
+ ret = btrfs_write_check(iocb, from, ret);
+ if (ret < 0) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ /*
+ * Re-check since file size may have changed just before taking the
+ * lock or pos may have changed because of O_APPEND in generic_write_check()
+ */
+ if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
+ pos + iov_iter_count(from) > i_size_read(inode)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ ilock_flags &= ~BTRFS_ILOCK_SHARED;
+ goto relock;
+ }
+
+ if (check_direct_IO(fs_info, from, pos)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
+
+ /*
+ * The iov_iter can be mapped to the same file range we are writing to.
+ * If that's the case, then we will deadlock in the iomap code, because
+ * it first calls our callback btrfs_dio_iomap_begin(), which will create
+ * an ordered extent, and after that it will fault in the pages that the
+ * iov_iter refers to. During the fault in we end up in the readahead
+ * pages code (starting at btrfs_readahead()), which will lock the range,
+ * find that ordered extent and then wait for it to complete (at
+ * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
+ * obviously the ordered extent can never complete as we didn't submit
+ * yet the respective bio(s). This always happens when the buffer is
+ * memory mapped to the same file range, since the iomap DIO code always
+ * invalidates pages in the target file range (after starting and waiting
+ * for any writeback).
+ *
+ * So here we disable page faults in the iov_iter and then retry if we
+ * got -EFAULT, faulting in the pages before the retry.
+ */
+ from->nofault = true;
+ dio = btrfs_dio_write(iocb, from, written);
+ from->nofault = false;
+
+ /*
+ * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
+ * iocb, and that needs to lock the inode. So unlock it before calling
+ * iomap_dio_complete() to avoid a deadlock.
+ */
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
+ if (IS_ERR_OR_NULL(dio))
+ ret = PTR_ERR_OR_ZERO(dio);
+ else
+ ret = iomap_dio_complete(dio);
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ written = ret;
+
+ if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(from);
+ /*
+ * We have more data left to write. Try to fault in as many as
+ * possible of the remainder pages and retry. We do this without
+ * releasing and locking again the inode, to prevent races with
+ * truncate.
+ *
+ * Also, in case the iov refers to pages in the file range of the
+ * file we want to write to (due to a mmap), we could enter an
+ * infinite loop if we retry after faulting the pages in, since
+ * iomap will invalidate any pages in the range early on, before
+ * it tries to fault in the pages of the iov. So we keep track of
+ * how much was left of iov in the previous EFAULT and fallback
+ * to buffered IO in case we haven't made any progress.
+ */
+ if (left == prev_left) {
+ ret = -ENOTBLK;
+ } else {
+ fault_in_iov_iter_readable(from, left);
+ prev_left = left;
+ goto relock;
+ }
+ }
+
+ /*
+ * If 'ret' is -ENOTBLK or we have not written all data, then it means
+ * we must fallback to buffered IO.
+ */
+ if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
+ goto out;
+
+buffered:
+ /*
+ * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
+ * it must retry the operation in a context where blocking is acceptable,
+ * because even if we end up not blocking during the buffered IO attempt
+ * below, we will block when flushing and waiting for the IO.
+ */
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ pos = iocb->ki_pos;
+ written_buffered = btrfs_buffered_write(iocb, from);
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
+ endbyte = pos + written_buffered - 1;
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
+ if (ret)
+ goto out;
+ ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
+ if (ret)
+ goto out;
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+out:
+ return ret < 0 ? ret : written;
+}
+
+static int check_direct_read(struct btrfs_fs_info *fs_info,
+ const struct iov_iter *iter, loff_t offset)
+{
+ int ret;
+ int i, seg;
+
+ ret = check_direct_IO(fs_info, iter, offset);
+ if (ret < 0)
+ return ret;
+
+ if (!iter_is_iovec(iter))
+ return 0;
+
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ const struct iovec *iov1 = iter_iov(iter) + seg;
+ const struct iovec *iov2 = iter_iov(iter) + i;
+
+ if (iov1->iov_base == iov2->iov_base)
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t prev_left = 0;
+ ssize_t read = 0;
+ ssize_t ret;
+
+ if (fsverity_active(inode))
+ return 0;
+
+ if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
+ return 0;
+
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+again:
+ /*
+ * This is similar to what we do for direct IO writes, see the comment
+ * at btrfs_direct_write(), but we also disable page faults in addition
+ * to disabling them only at the iov_iter level. This is because when
+ * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
+ * which can still trigger page fault ins despite having set ->nofault
+ * to true of our 'to' iov_iter.
+ *
+ * The difference to direct IO writes is that we deadlock when trying
+ * to lock the extent range in the inode's tree during he page reads
+ * triggered by the fault in (while for writes it is due to waiting for
+ * our own ordered extent). This is because for direct IO reads,
+ * btrfs_dio_iomap_begin() returns with the extent range locked, which
+ * is only unlocked in the endio callback (end_bio_extent_readpage()).
+ */
+ pagefault_disable();
+ to->nofault = true;
+ ret = btrfs_dio_read(iocb, to, read);
+ to->nofault = false;
+ pagefault_enable();
+
+ /* No increment (+=) because iomap returns a cumulative value. */
+ if (ret > 0)
+ read = ret;
+
+ if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
+ const size_t left = iov_iter_count(to);
+
+ if (left == prev_left) {
+ /*
+ * We didn't make any progress since the last attempt,
+ * fallback to a buffered read for the remainder of the
+ * range. This is just to avoid any possibility of looping
+ * for too long.
+ */
+ ret = read;
+ } else {
+ /*
+ * We made some progress since the last retry or this is
+ * the first time we are retrying. Fault in as many pages
+ * as possible and retry.
+ */
+ fault_in_iov_iter_writeable(to, left);
+ prev_left = left;
+ goto again;
+ }
+ }
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
+ return ret < 0 ? ret : read;
+}
+
+int __init btrfs_init_dio(void)
+{
+ if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
+ offsetof(struct btrfs_dio_private, bbio.bio),
+ BIOSET_NEED_BVECS))
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __cold btrfs_destroy_dio(void)
+{
+ bioset_exit(&btrfs_dio_bioset);
+}
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
new file mode 100644
index 000000000000..3dc3ea926afe
--- /dev/null
+++ b/fs/btrfs/direct-io.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_DIRECT_IO_H
+#define BTRFS_DIRECT_IO_H
+
+#include <linux/types.h>
+
+int __init btrfs_init_dio(void);
+void __cold btrfs_destroy_dio(void);
+
+ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from);
+ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to);
+
+#endif /* BTRFS_DIRECT_IO_H */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38cdb8875e8e..a6f5441e62d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -213,7 +213,7 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
* structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int failed = 0;
@@ -358,7 +358,7 @@ static bool check_tree_block_fsid(struct extent_buffer *eb)
/* Do basic extent buffer checks at read time */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@@ -367,6 +367,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
u8 result[BTRFS_CSUM_SIZE];
const u8 *header_csum;
int ret = 0;
+ const bool ignore_csum = btrfs_test_opt(fs_info, IGNOREMETACSUMS);
ASSERT(check);
@@ -399,13 +400,16 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
-"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s",
eb->start, eb->read_mirror,
CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
- btrfs_header_level(eb));
- ret = -EUCLEAN;
- goto out;
+ btrfs_header_level(eb),
+ ignore_csum ? ", ignored" : "");
+ if (!ignore_csum) {
+ ret = -EUCLEAN;
+ goto out;
+ }
}
if (found_level != check->level) {
@@ -425,7 +429,7 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
if (check->has_first_key) {
- struct btrfs_key *expect_key = &check->first_key;
+ const struct btrfs_key *expect_key = &check->first_key;
struct btrfs_key found_key;
if (found_level)
@@ -635,10 +639,6 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
- if (btrfs_check_eb_owner(buf, check->owner_root)) {
- free_extent_buffer_stale(buf);
- return ERR_PTR(-EUCLEAN);
- }
return buf;
}
@@ -658,11 +658,11 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
RB_CLEAR_NODE(&root->rb_node);
- root->last_trans = 0;
+ btrfs_set_root_last_trans(root, 0);
root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
- root->inode_tree = RB_ROOT;
+ xa_init(&root->inodes);
xa_init(&root->delayed_nodes);
btrfs_init_root_block_rsv(root);
@@ -674,7 +674,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
@@ -847,13 +846,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
-{
- if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
- return fs_info->block_group_root;
- return btrfs_extent_root(fs_info, 0);
-}
-
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@@ -1010,7 +1002,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- log_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(log_root, trans->transid);
log_root->root_key.offset = btrfs_root_id(root);
inode_item = &log_root->root_item.inode;
@@ -1033,7 +1025,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_path *path,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_tree_parent_check check = { 0 };
@@ -1095,7 +1087,7 @@ fail:
}
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key)
+ const struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_path *path;
@@ -1230,7 +1222,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
struct btrfs_root *root;
@@ -1854,7 +1846,8 @@ void btrfs_put_root(struct btrfs_root *root)
return;
if (refcount_dec_and_test(&root->refs)) {
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ if (WARN_ON(!xa_empty(&root->inodes)))
+ xa_destroy(&root->inodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1928,7 +1921,7 @@ static int btrfs_init_btree_inode(struct super_block *sb)
if (!inode)
return -ENOMEM;
- inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_BTREE_INODE_OBJECTID);
set_nlink(inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
@@ -1939,15 +1932,11 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
- BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
- BTRFS_I(inode)->location.type = 0;
- BTRFS_I(inode)->location.offset = 0;
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
__insert_inode_hash(inode, hash);
fs_info->btree_inode = inode;
@@ -2146,7 +2135,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
/* If we have IGNOREDATACSUMS skip loading these roots. */
if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
return 0;
}
@@ -2199,7 +2188,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
if (!found || ret) {
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
- set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ set_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state);
if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
ret = ret ? ret : -ENOENT;
@@ -2350,21 +2339,29 @@ out:
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
+ const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS);
if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
btrfs_err(fs_info, "no valid FS found");
ret = -EINVAL;
}
- if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
- btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
- btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
- ret = -EINVAL;
+ if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) {
+ if (!ignore_flags) {
+ btrfs_err(fs_info,
+ "unrecognized or unsupported super flag 0x%llx",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ ret = -EINVAL;
+ } else {
+ btrfs_info(fs_info,
+ "unrecognized or unsupported super flags: 0x%llx, ignored",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+ }
}
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
btrfs_err(fs_info, "tree_root level too big: %d >= %d",
@@ -2467,7 +2464,7 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info,
(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
!btrfs_fs_incompat(fs_info, NO_HOLES))) {
btrfs_err(fs_info,
- "block-group-tree feature requires fres-space-tree and no-holes");
+ "block-group-tree feature requires free-space-tree and no-holes");
ret = -EINVAL;
}
@@ -2856,6 +2853,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
+ spin_lock_init(&fs_info->extent_map_shrinker_lock);
+
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -2880,6 +2879,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (sb_rdonly(sb))
set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
+ if (btrfs_test_opt(fs_info, IGNOREMETACSUMS))
+ set_bit(BTRFS_FS_STATE_SKIP_META_CSUMS, &fs_info->fs_state);
return btrfs_alloc_stripe_hash_table(fs_info);
}
@@ -2925,22 +2926,22 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i = 0;
- int err = 0;
- unsigned int ret = 0;
+ int ret = 0;
while (1) {
+ unsigned int found;
+
spin_lock(&fs_info->fs_roots_radix_lock);
- ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret) {
+ if (!found) {
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
- root_objectid = btrfs_root_id(gang[ret - 1]) + 1;
+ root_objectid = btrfs_root_id(gang[found - 1]) + 1;
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
/* Avoid to grab roots in dead_roots. */
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
gang[i] = NULL;
@@ -2951,24 +2952,25 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < ret; i++) {
+ for (int i = 0; i < found; i++) {
if (!gang[i])
continue;
root_objectid = btrfs_root_id(gang[i]);
- err = btrfs_orphan_cleanup(gang[i]);
- if (err)
- goto out;
+ /*
+ * Continue to release the remaining roots after the first
+ * error without cleanup and preserve the first error
+ * for the return.
+ */
+ if (!ret)
+ ret = btrfs_orphan_cleanup(gang[i]);
btrfs_put_root(gang[i]);
}
+ if (ret)
+ break;
+
root_objectid++;
}
-out:
- /* Release the uncleaned roots due to error. */
- for (; i < ret; i++) {
- if (gang[i])
- btrfs_put_root(gang[i]);
- }
- return err;
+ return ret;
}
/*
@@ -3202,7 +3204,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
}
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- char *options)
+ const char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -4155,9 +4157,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_trans_handle *trans;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
@@ -4167,10 +4166,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
@@ -4531,7 +4527,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
* extents that haven't had their dirty pages IO start writeout yet
* actually get run and error out properly.
*/
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 76eb53fe7a11..99af64d3f277 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -41,7 +41,7 @@ static inline u64 btrfs_sb_offset(int mirror)
return BTRFS_SUPER_INFO_OFFSET;
}
-void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
+void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
struct btrfs_tree_parent_check *check);
@@ -52,12 +52,11 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
+ const char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
-int btrfs_validate_super(struct btrfs_fs_info *fs_info,
- struct btrfs_super_block *sb, int mirror_num);
+int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
@@ -65,7 +64,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
- struct btrfs_key *key);
+ const struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
@@ -83,7 +82,6 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
-struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
@@ -91,7 +89,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -118,7 +116,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
- struct btrfs_tree_parent_check *check);
+ const struct btrfs_tree_parent_check *check);
blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9e81f89e76d8..e2b22bea348a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -40,7 +40,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
if (parent) {
u64 parent_root_id;
- fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+ fid->parent_objectid = btrfs_ino(BTRFS_I(parent));
fid->parent_gen = parent->i_generation;
parent_root_id = btrfs_root_id(BTRFS_I(parent)->root);
@@ -84,7 +84,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(root))
return ERR_CAST(root);
- inode = btrfs_iget(sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
btrfs_put_root(root);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -210,7 +210,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));
+ return d_obtain_alias(btrfs_iget(key.objectid, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ed2cfc3d5d8a..c54c5d7a5cd5 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -4,6 +4,7 @@
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
+#include "extent_io.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
@@ -1084,6 +1085,9 @@ again:
*/
prealloc = alloc_extent_state(mask);
}
+ /* Optimistically preallocate the extent changeset ulist node. */
+ if (changeset)
+ extent_changeset_prealloc(changeset, mask);
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3774c191e36d..d77498e7671c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -104,10 +104,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
struct btrfs_key key;
- u32 item_size;
u64 num_refs;
u64 extent_flags;
u64 owner = 0;
@@ -126,11 +123,6 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- if (!trans) {
- path->skip_locking = 1;
- path->search_commit_root = 1;
- }
-
search_again:
key.objectid = bytenr;
key.offset = offset;
@@ -144,7 +136,7 @@ search_again:
if (ret < 0)
goto out_free;
- if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -157,38 +149,37 @@ search_again:
}
if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- owner = btrfs_get_extent_owner_root(fs_info, leaf,
- path->slots[0]);
- } else {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_extent_item *ei;
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (unlikely(item_size < sizeof(*ei))) {
ret = -EUCLEAN;
btrfs_err(fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(fs_info, ret, NULL);
-
+ btrfs_abort_transaction(trans, ret);
goto out_free;
}
- BUG_ON(num_refs == 0);
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ if (unlikely(num_refs == 0)) {
+ ret = -EUCLEAN;
+ btrfs_err(fs_info,
+ "unexpected zero reference count for extent item (%llu %u %llu)",
+ key.objectid, key.type, key.offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out_free;
+ }
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
} else {
num_refs = 0;
extent_flags = 0;
ret = 0;
}
- if (!trans)
- goto out;
-
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
@@ -211,15 +202,13 @@ search_again:
spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
num_refs += head->ref_mod;
spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
-out:
+
WARN_ON(num_refs == 0);
if (refs)
*refs = num_refs;
@@ -1632,7 +1621,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
if (metadata) {
key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = extent_op->level;
+ key.offset = head->level;
} else {
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = head->num_bytes;
@@ -1667,7 +1656,7 @@ again:
ret = -EUCLEAN;
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
- head->bytenr, head->num_bytes, extent_op->level);
+ head->bytenr, head->num_bytes, head->level);
goto out;
}
}
@@ -1726,7 +1715,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
.generation = trans->transid,
};
- BUG_ON(!extent_op || !extent_op->update_flags);
ret = alloc_reserved_tree_block(trans, node, extent_op);
if (!ret)
btrfs_record_squota_delta(fs_info, &delta);
@@ -2233,7 +2221,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags)
{
struct btrfs_delayed_extent_op *extent_op;
- int level = btrfs_header_level(eb);
int ret;
extent_op = btrfs_alloc_delayed_extent_op();
@@ -2243,9 +2230,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
- extent_op->level = level;
- ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
+ ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len,
+ btrfs_header_level(eb), extent_op);
if (ret)
btrfs_free_delayed_extent_op(extent_op);
return ret;
@@ -3419,10 +3406,10 @@ out_delayed_unlock:
return 0;
}
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref)
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *bg;
@@ -3449,11 +3436,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret < 0)
+ return ret;
}
if (!last_ref)
- return;
+ return 0;
if (btrfs_header_generation(buf) != trans->transid)
goto out;
@@ -3510,6 +3498,7 @@ out:
* matter anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
+ return 0;
}
/* Can return -ENOMEM */
@@ -4864,7 +4853,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 flags = extent_op->flags_to_set;
+ const u64 flags = (extent_op ? extent_op->flags_to_set : 0);
/* The owner of a tree block is the level. */
int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -5121,7 +5110,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
- struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = fs_info->nodesize;
@@ -5164,6 +5152,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
struct btrfs_ref generic_ref = {
.action = BTRFS_ADD_DELAYED_EXTENT,
.bytenr = ins.objectid,
@@ -5172,30 +5161,34 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
.owning_root = owning_root,
.ref_root = root_objectid,
};
- extent_op = btrfs_alloc_delayed_extent_op();
- if (!extent_op) {
- ret = -ENOMEM;
- goto out_free_buf;
+
+ if (!skinny_metadata || flags != 0) {
+ extent_op = btrfs_alloc_delayed_extent_op();
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = (skinny_metadata ? false : true);
+ extent_op->update_flags = (flags != 0);
+ } else {
+ extent_op = NULL;
}
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = skinny_metadata ? false : true;
- extent_op->update_flags = true;
- extent_op->level = level;
btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
- if (ret)
- goto out_free_delayed;
+ if (ret) {
+ btrfs_free_delayed_extent_op(extent_op);
+ goto out_free_buf;
+ }
}
return buf;
-out_free_delayed:
- btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
@@ -5220,11 +5213,99 @@ struct walk_control {
int reada_slot;
int reada_count;
int restarted;
+ /* Indicate that extent info needs to be looked up when walking the tree. */
+ int lookup_info;
};
+/*
+ * This is our normal stage. We are traversing blocks the current snapshot owns
+ * and we are dropping any of our references to any children we are able to, and
+ * then freeing the block once we've processed all of the children.
+ */
#define DROP_REFERENCE 1
+
+/*
+ * We enter this stage when we have to walk into a child block (meaning we can't
+ * simply drop our reference to it from our current parent node) and there are
+ * more than one reference on it. If we are the owner of any of the children
+ * blocks from the current parent node then we have to do the FULL_BACKREF dance
+ * on them in order to drop our normal ref and add the shared ref.
+ */
#define UPDATE_BACKREF 2
+/*
+ * Decide if we need to walk down into this node to adjust the references.
+ *
+ * @root: the root we are currently deleting
+ * @wc: the walk control for this deletion
+ * @eb: the parent eb that we're currently visiting
+ * @refs: the number of refs for wc->level - 1
+ * @flags: the flags for wc->level - 1
+ * @slot: the slot in the eb that we're currently checking
+ *
+ * This is meant to be called when we're evaluating if a node we point to at
+ * wc->level should be read and walked into, or if we can simply delete our
+ * reference to it. We return true if we should walk into the node, false if we
+ * can skip it.
+ *
+ * We have assertions in here to make sure this is called correctly. We assume
+ * that sanity checking on the blocks read to this point has been done, so any
+ * corrupted file systems must have been caught before calling this function.
+ */
+static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
+ struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+{
+ struct btrfs_key key;
+ u64 generation;
+ int level = wc->level;
+
+ ASSERT(level > 0);
+ ASSERT(wc->refs[level - 1] > 0);
+
+ /*
+ * The update backref stage we only want to skip if we already have
+ * FULL_BACKREF set, otherwise we need to read.
+ */
+ if (wc->stage == UPDATE_BACKREF) {
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+ return true;
+ }
+
+ /*
+ * We're the last ref on this block, we must walk into it and process
+ * any refs it's pointing at.
+ */
+ if (wc->refs[level - 1] == 1)
+ return true;
+
+ /*
+ * If we're already FULL_BACKREF then we know we can just drop our
+ * current reference.
+ */
+ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ return false;
+
+ /*
+ * This block is older than our creation generation, we can drop our
+ * reference to it.
+ */
+ generation = btrfs_node_ptr_generation(eb, slot);
+ if (!wc->update_ref || generation <= root->root_key.offset)
+ return false;
+
+ /*
+ * This block was processed from a previous snapshot deletion run, we
+ * can skip it.
+ */
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0)
+ return false;
+
+ /* All other cases we need to wander into the node. */
+ return true;
+}
+
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
@@ -5236,7 +5317,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
u64 refs;
u64 flags;
u32 nritems;
- struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
@@ -5276,28 +5356,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
/* We don't care about errors in readahead. */
if (ret < 0)
continue;
- BUG_ON(refs == 0);
- if (wc->stage == DROP_REFERENCE) {
- if (refs == 1)
- goto reada;
+ /*
+ * This could be racey, it's conceivable that we raced and end
+ * up with a bogus refs count, if that's the case just skip, if
+ * we are actually corrupt we will notice when we look up
+ * everything again with our locks.
+ */
+ if (refs == 0)
+ continue;
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- continue;
- btrfs_node_key_to_cpu(eb, &key, slot);
- ret = btrfs_comp_cpu_keys(&key,
- &wc->update_progress);
- if (ret < 0)
- continue;
- } else {
- if (wc->level == 1 &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- continue;
- }
+ /* If we don't need to visit this node don't reada. */
+ if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ continue;
reada:
btrfs_readahead_node_child(eb, slot);
nread++;
@@ -5316,7 +5387,7 @@ reada:
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int level = wc->level;
@@ -5331,19 +5402,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
* when reference count of tree block is 1, it won't increase
* again. once full backref flag is set, we never clear it.
*/
- if (lookup_info &&
+ if (wc->lookup_info &&
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_lookup_extent_info(trans, fs_info,
eb->start, level, 1,
&wc->refs[level],
&wc->flags[level],
NULL);
- BUG_ON(ret == -ENOMEM);
if (ret)
return ret;
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
}
if (wc->stage == DROP_REFERENCE) {
@@ -5359,13 +5433,22 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
- BUG_ON(!path->locks[level]);
+ ASSERT(path->locks[level]);
ret = btrfs_inc_ref(trans, root, eb, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = btrfs_set_disk_extent_flags(trans, eb, flag);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
wc->flags[level] |= flag;
}
@@ -5408,6 +5491,132 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
}
/*
+ * We may not have an uptodate block, so if we are going to walk down into this
+ * block we need to drop the lock, read it off of the disk, re-lock it and
+ * return to continue dropping the snapshot.
+ */
+static int check_next_block_uptodate(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc,
+ struct extent_buffer *next)
+{
+ struct btrfs_tree_parent_check check = { 0 };
+ u64 generation;
+ int level = wc->level;
+ int ret;
+
+ btrfs_assert_tree_write_locked(next);
+
+ generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]);
+
+ if (btrfs_buffer_uptodate(next, generation, 0))
+ return 0;
+
+ check.level = level - 1;
+ check.transid = generation;
+ check.owner_root = btrfs_root_id(root);
+ check.has_first_key = true;
+ btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]);
+
+ btrfs_tree_unlock(next);
+ if (level == 1)
+ reada_walk_down(trans, root, wc, path);
+ ret = btrfs_read_extent_buffer(next, &check);
+ if (ret) {
+ free_extent_buffer(next);
+ return ret;
+ }
+ btrfs_tree_lock(next);
+ wc->lookup_info = 1;
+ return 0;
+}
+
+/*
+ * If we determine that we don't have to visit wc->level - 1 then we need to
+ * determine if we can drop our reference.
+ *
+ * If we are UPDATE_BACKREF then we will not, we need to update our backrefs.
+ *
+ * If we are DROP_REFERENCE this will figure out if we need to drop our current
+ * reference, skipping it if we dropped it from a previous incompleted drop, or
+ * dropping it if we still have a reference to it.
+ */
+static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ struct btrfs_path *path, struct walk_control *wc,
+ struct extent_buffer *next, u64 owner_root)
+{
+ struct btrfs_ref ref = {
+ .action = BTRFS_DROP_DELAYED_REF,
+ .bytenr = next->start,
+ .num_bytes = root->fs_info->nodesize,
+ .owning_root = owner_root,
+ .ref_root = btrfs_root_id(root),
+ };
+ int level = wc->level;
+ int ret;
+
+ /* We are UPDATE_BACKREF, we're not dropping anything. */
+ if (wc->stage == UPDATE_BACKREF)
+ return 0;
+
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ ref.parent = path->nodes[level]->start;
+ } else {
+ ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level]));
+ if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) {
+ btrfs_err(root->fs_info, "mismatched block owner");
+ return -EIO;
+ }
+ }
+
+ /*
+ * If we had a drop_progress we need to verify the refs are set as
+ * expected. If we find our ref then we know that from here on out
+ * everything should be correct, and we can clear the
+ * ->restarted flag.
+ */
+ if (wc->restarted) {
+ ret = check_ref_exists(trans, root, next->start, ref.parent,
+ level - 1);
+ if (ret <= 0)
+ return ret;
+ ret = 0;
+ wc->restarted = 0;
+ }
+
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have already
+ * accounted them at merge time (replace_path), thus we could skip
+ * expensive subtree trace here.
+ */
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ wc->refs[level - 1] > 1) {
+ u64 generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
+
+ ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1);
+ if (ret) {
+ btrfs_err_rl(root->fs_info,
+"error %d accounting shared subtree, quota is out of sync, rescan required",
+ ret);
+ }
+ }
+
+ /*
+ * We need to update the next key in our walk control so we can update
+ * the drop_progress key accordingly. We don't care if find_next_key
+ * doesn't find a key because that means we're at the end and are going
+ * to clean up now.
+ */
+ wc->drop_level = level;
+ find_next_key(path, level, &wc->drop_progress);
+
+ btrfs_init_tree_ref(&ref, level - 1, 0, false);
+ return btrfs_free_extent(trans, &ref);
+}
+
+/*
* helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
@@ -5423,19 +5632,15 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- struct walk_control *wc, int *lookup_info)
+ struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 generation;
u64 owner_root = 0;
- struct btrfs_tree_parent_check check = { 0 };
- struct btrfs_key key;
struct extent_buffer *next;
int level = wc->level;
- int reada = 0;
int ret = 0;
- bool need_account = false;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
@@ -5446,27 +5651,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
*/
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset) {
- *lookup_info = 1;
+ wc->lookup_info = 1;
return 1;
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
- check.level = level - 1;
- check.transid = generation;
- check.owner_root = btrfs_root_id(root);
- check.has_first_key = true;
- btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
- path->slots[level]);
+ next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root),
+ level - 1);
+ if (IS_ERR(next))
+ return PTR_ERR(next);
- next = find_extent_buffer(fs_info, bytenr);
- if (!next) {
- next = btrfs_find_create_tree_block(fs_info, bytenr,
- btrfs_root_id(root), level - 1);
- if (IS_ERR(next))
- return PTR_ERR(next);
- reada = 1;
- }
btrfs_tree_lock(next);
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
@@ -5477,57 +5672,32 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
goto out_unlock;
if (unlikely(wc->refs[level - 1] == 0)) {
- btrfs_err(fs_info, "Missing references.");
- ret = -EIO;
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ bytenr);
+ ret = -EUCLEAN;
goto out_unlock;
}
- *lookup_info = 0;
+ wc->lookup_info = 0;
- if (wc->stage == DROP_REFERENCE) {
- if (wc->refs[level - 1] > 1) {
- need_account = true;
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
-
- if (!wc->update_ref ||
- generation <= root->root_key.offset)
- goto skip;
-
- btrfs_node_key_to_cpu(path->nodes[level], &key,
- path->slots[level]);
- ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
- if (ret < 0)
- goto skip;
+ /* If we don't have to walk into this node skip it. */
+ if (!visit_node_for_delete(root, wc, path->nodes[level],
+ wc->refs[level - 1], wc->flags[level - 1],
+ path->slots[level]))
+ goto skip;
- wc->stage = UPDATE_BACKREF;
- wc->shared_level = level - 1;
- }
- } else {
- if (level == 1 &&
- (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- goto skip;
+ /*
+ * We have to walk down into this node, and if we're currently at the
+ * DROP_REFERNCE stage and this block is shared then we need to switch
+ * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF.
+ */
+ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) {
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
}
- if (!btrfs_buffer_uptodate(next, generation, 0)) {
- btrfs_tree_unlock(next);
- free_extent_buffer(next);
- next = NULL;
- *lookup_info = 1;
- }
-
- if (!next) {
- if (reada && level == 1)
- reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, &check);
- if (IS_ERR(next)) {
- return PTR_ERR(next);
- } else if (!extent_buffer_uptodate(next)) {
- free_extent_buffer(next);
- return -EIO;
- }
- btrfs_tree_lock(next);
- }
+ ret = check_next_block_uptodate(trans, root, path, wc, next);
+ if (ret)
+ return ret;
level--;
ASSERT(level == btrfs_header_level(next));
@@ -5544,78 +5714,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
wc->reada_slot = 0;
return 0;
skip:
+ ret = maybe_drop_reference(trans, root, path, wc, next, owner_root);
+ if (ret)
+ goto out_unlock;
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
- if (wc->stage == DROP_REFERENCE) {
- struct btrfs_ref ref = {
- .action = BTRFS_DROP_DELAYED_REF,
- .bytenr = bytenr,
- .num_bytes = fs_info->nodesize,
- .owning_root = owner_root,
- .ref_root = btrfs_root_id(root),
- };
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
- ref.parent = path->nodes[level]->start;
- } else {
- ASSERT(btrfs_root_id(root) ==
- btrfs_header_owner(path->nodes[level]));
- if (btrfs_root_id(root) !=
- btrfs_header_owner(path->nodes[level])) {
- btrfs_err(root->fs_info,
- "mismatched block owner");
- ret = -EIO;
- goto out_unlock;
- }
- }
-
- /*
- * If we had a drop_progress we need to verify the refs are set
- * as expected. If we find our ref then we know that from here
- * on out everything should be correct, and we can clear the
- * ->restarted flag.
- */
- if (wc->restarted) {
- ret = check_ref_exists(trans, root, bytenr, ref.parent,
- level - 1);
- if (ret < 0)
- goto out_unlock;
- if (ret == 0)
- goto no_delete;
- ret = 0;
- wc->restarted = 0;
- }
-
- /*
- * Reloc tree doesn't contribute to qgroup numbers, and we have
- * already accounted them at merge time (replace_path),
- * thus we could skip expensive subtree trace here.
- */
- if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
- ret = btrfs_qgroup_trace_subtree(trans, next,
- generation, level - 1);
- if (ret) {
- btrfs_err_rl(fs_info,
- "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
- ret);
- }
- }
-
- /*
- * We need to update the next key in our walk control so we can
- * update the drop_progress key accordingly. We don't care if
- * find_next_key doesn't find a key because that means we're at
- * the end and are going to clean up now.
- */
- wc->drop_level = level;
- find_next_key(path, level, &wc->drop_progress);
-
- btrfs_init_tree_ref(&ref, level - 1, 0, false);
- ret = btrfs_free_extent(trans, &ref);
- if (ret)
- goto out_unlock;
- }
-no_delete:
- *lookup_info = 1;
+ wc->lookup_info = 1;
ret = 1;
out_unlock:
@@ -5643,13 +5747,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct walk_control *wc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ int ret = 0;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
if (wc->stage == UPDATE_BACKREF) {
- BUG_ON(wc->shared_level < level);
+ ASSERT(wc->shared_level >= level);
if (level < wc->shared_level)
goto out;
@@ -5667,7 +5771,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
* count is one.
*/
if (!path->locks[level]) {
- BUG_ON(level == 0);
+ ASSERT(level > 0);
btrfs_tree_lock(eb);
path->locks[level] = BTRFS_WRITE_LOCK;
@@ -5681,7 +5785,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return ret;
}
- BUG_ON(wc->refs[level] == 0);
+ if (unlikely(wc->refs[level] == 0)) {
+ btrfs_tree_unlock_rw(eb, path->locks[level]);
+ btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
+ eb->start);
+ return -EUCLEAN;
+ }
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
path->locks[level] = 0;
@@ -5691,7 +5800,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
}
/* wc->stage == DROP_REFERENCE */
- BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
+ ASSERT(path->locks[level] || wc->refs[level] == 1);
if (wc->refs[level] == 1) {
if (level == 0) {
@@ -5699,7 +5808,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 1);
else
ret = btrfs_dec_ref(trans, root, eb, 0);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
if (is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
@@ -5730,12 +5842,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
goto owner_mismatch;
}
- btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
- wc->refs[level] == 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
+ wc->refs[level] == 1);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return 0;
+ return ret;
owner_mismatch:
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
@@ -5743,17 +5857,38 @@ owner_mismatch:
return -EUCLEAN;
}
+/*
+ * walk_down_tree consists of two steps.
+ *
+ * walk_down_proc(). Look up the reference count and reference of our current
+ * wc->level. At this point path->nodes[wc->level] should be populated and
+ * uptodate, and in most cases should already be locked. If we are in
+ * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and
+ * we can walk back up the tree. If we are UPDATE_BACKREF we have to set
+ * FULL_BACKREF on this node if it's not already set, and then do the
+ * FULL_BACKREF conversion dance, which is to drop the root reference and add
+ * the shared reference to all of this nodes children.
+ *
+ * do_walk_down(). This is where we actually start iterating on the children of
+ * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping
+ * our reference to the children that return false from visit_node_for_delete(),
+ * which has various conditions where we know we can just drop our reference
+ * without visiting the node. For UPDATE_BACKREF we will skip any children that
+ * visit_node_for_delete() returns false for, only walking down when necessary.
+ * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of
+ * snapshot deletion.
+ */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
int level = wc->level;
- int lookup_info = 1;
int ret = 0;
+ wc->lookup_info = 1;
while (level >= 0) {
- ret = walk_down_proc(trans, root, path, wc, lookup_info);
+ ret = walk_down_proc(trans, root, path, wc);
if (ret)
break;
@@ -5764,7 +5899,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
btrfs_header_nritems(path->nodes[level]))
break;
- ret = do_walk_down(trans, root, path, wc, &lookup_info);
+ ret = do_walk_down(trans, root, path, wc);
if (ret > 0) {
path->slots[level]++;
continue;
@@ -5775,6 +5910,23 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
return (ret == 1) ? 0 : ret;
}
+/*
+ * walk_up_tree() is responsible for making sure we visit every slot on our
+ * current node, and if we're at the end of that node then we call
+ * walk_up_proc() on our current node which will do one of a few things based on
+ * our stage.
+ *
+ * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level
+ * then we need to walk back up the tree, and then going back down into the
+ * other slots via walk_down_tree to update any other children from our original
+ * wc->shared_level. Once we're at or above our wc->shared_level we can switch
+ * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on.
+ *
+ * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block.
+ * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents
+ * in our current leaf. After that we call btrfs_free_tree_block() on the
+ * current node and walk up to the next node to walk down the next slot.
+ */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -5833,8 +5985,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
struct btrfs_root_item *root_item = &root->root_item;
struct walk_control *wc;
struct btrfs_key key;
- int err = 0;
- int ret;
+ const u64 rootid = btrfs_root_id(root);
+ int ret = 0;
int level;
bool root_dropped = false;
bool unfinished_drop = false;
@@ -5843,14 +5995,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
btrfs_free_path(path);
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -5863,12 +6015,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
- err = btrfs_run_delayed_items(trans);
- if (err)
+ ret = btrfs_run_delayed_items(trans);
+ if (ret)
goto out_end_trans;
/*
@@ -5899,11 +6051,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
path->lowest_level = 0;
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
WARN_ON(ret > 0);
+ ret = 0;
/*
* unlock our path, this is safe because only this
@@ -5916,14 +6068,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_tree_lock(path->nodes[level]);
path->locks[level] = BTRFS_WRITE_LOCK;
+ /*
+ * btrfs_lookup_extent_info() returns 0 for success,
+ * or < 0 for error.
+ */
ret = btrfs_lookup_extent_info(trans, fs_info,
path->nodes[level]->start,
level, 1, &wc->refs[level],
&wc->flags[level], NULL);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end_trans;
- }
+
BUG_ON(wc->refs[level] == 0);
if (level == btrfs_root_drop_level(root_item))
@@ -5949,19 +6104,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
ret = walk_down_tree(trans, root, path, wc);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
break;
}
if (ret > 0) {
BUG_ON(wc->stage != DROP_REFERENCE);
+ ret = 0;
break;
}
@@ -5983,7 +6137,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
root_item);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -5994,7 +6147,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
"drop snapshot early exit");
- err = -EAGAIN;
+ ret = -EAGAIN;
goto out_free;
}
@@ -6008,19 +6161,18 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
else
trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_free;
}
}
}
btrfs_release_path(path);
- if (err)
+ if (ret)
goto out_end_trans;
ret = btrfs_del_root(trans, &root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
}
@@ -6029,10 +6181,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
NULL, NULL);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
- err = ret;
goto out_end_trans;
} else if (ret > 0) {
- /* if we fail to delete the orphan item this time
+ ret = 0;
+ /*
+ * If we fail to delete the orphan item this time
* around, it'll get picked up the next time.
*
* The most common failure here is just -ENOENT.
@@ -6063,11 +6216,19 @@ out_free:
kfree(wc);
btrfs_free_path(path);
out:
+ if (!ret && root_dropped) {
+ ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid);
+ if (ret < 0)
+ btrfs_warn_rl(fs_info,
+ "failed to cleanup qgroup 0/%llu: %d",
+ rootid, ret);
+ ret = 0;
+ }
/*
* We were an unfinished drop root, check to see if there are any
* pending, and if not clear and wake up any waiters.
*/
- if (!err && unfinished_drop)
+ if (!ret && unfinished_drop)
btrfs_maybe_wake_unfinished_drop(fs_info);
/*
@@ -6079,7 +6240,7 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- return err;
+ return ret;
}
/*
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index af9f8800d5ac..2ad51130c037 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -127,10 +127,10 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 empty_size,
u64 reloc_src_root,
enum btrfs_lock_nesting nest);
-void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- u64 root_id,
- struct extent_buffer *buf,
- u64 parent, int last_ref);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ u64 root_id,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f688fab55251..aa7f8148cd0d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -164,23 +164,8 @@ void __cold extent_buffer_free_cachep(void)
kmem_cache_destroy(extent_buffer_cache);
}
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- clear_page_dirty_for_io(page);
- put_page(page);
- index++;
- }
-}
-
static void process_one_page(struct btrfs_fs_info *fs_info,
- struct page *page, struct page *locked_page,
+ struct page *page, const struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
struct folio *folio = page_folio(page);
@@ -203,7 +188,7 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
}
static void __process_pages_contig(struct address_space *mapping,
- struct page *locked_page, u64 start, u64 end,
+ const struct page *locked_page, u64 start, u64 end,
unsigned long page_ops)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
@@ -230,8 +215,8 @@ static void __process_pages_contig(struct address_space *mapping,
}
}
-static noinline void __unlock_for_delalloc(struct inode *inode,
- struct page *locked_page,
+static noinline void __unlock_for_delalloc(const struct inode *inode,
+ const struct page *locked_page,
u64 start, u64 end)
{
unsigned long index = start >> PAGE_SHIFT;
@@ -246,7 +231,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
}
static noinline int lock_delalloc_pages(struct inode *inode,
- struct page *locked_page,
+ const struct page *locked_page,
u64 start,
u64 end)
{
@@ -411,7 +396,7 @@ out_failed:
}
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct page *locked_page,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
@@ -667,24 +652,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/*
- * Populate every free slot in a provided array with folios.
+ * Populate every free slot in a provided array with folios using GFP_NOFS.
*
* @nr_folios: number of folios to allocate
* @folio_array: the array to fill with folios; any existing non-NULL entries in
* the array will be skipped
- * @extra_gfp: the extra GFP flags for the allocation
*
* Return: 0 if all folios were able to be allocated;
* -ENOMEM otherwise, the partially allocated folios would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
- gfp_t extra_gfp)
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array)
{
for (int i = 0; i < nr_folios; i++) {
if (folio_array[i])
continue;
- folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
+ folio_array[i] = folio_alloc(GFP_NOFS, 0);
if (!folio_array[i])
goto error;
}
@@ -698,21 +681,21 @@ error:
}
/*
- * Populate every free slot in a provided array with pages.
+ * Populate every free slot in a provided array with pages, using GFP_NOFS.
*
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
- * the array will be skipped
- * @extra_gfp: the extra GFP flags for the allocation.
+ * the array will be skipped
+ * @nofail: whether using __GFP_NOFAIL flag
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
- gfp_t extra_gfp)
+ bool nofail)
{
- const gfp_t gfp = GFP_NOFS | extra_gfp;
+ const gfp_t gfp = nofail ? (GFP_NOFS | __GFP_NOFAIL) : GFP_NOFS;
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
@@ -736,13 +719,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
*
* For now, the folios populated are always in order 0 (aka, single page).
*/
-static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
{
struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
int num_pages = num_extent_pages(eb);
int ret;
- ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ ret = btrfs_alloc_page_array(num_pages, page_array, nofail);
if (ret < 0)
return ret;
@@ -860,7 +843,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
- ASSERT(is_data_inode(&inode->vfs_inode));
+ ASSERT(is_data_inode(inode));
len = bio_ctrl->len_to_oe_boundary;
}
@@ -1083,10 +1066,10 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
- disk_bytenr = em->block_start;
+ disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = em->block_start + extent_offset;
- block_start = em->block_start;
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = extent_map_block_start(em);
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
@@ -1226,13 +1209,23 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc)
{
+ struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
+ struct folio *folio = page_folio(page);
+ const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping);
const u64 page_start = page_offset(page);
const u64 page_end = page_start + PAGE_SIZE - 1;
+ /*
+ * Save the last found delalloc end. As the delalloc end can go beyond
+ * page boundary, thus we cannot rely on subpage bitmap to locate the
+ * last delalloc end.
+ */
+ u64 last_delalloc_end = 0;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ /* Lock all (subpage) delalloc ranges inside the page first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
if (!find_lock_delalloc_range(&inode->vfs_inode, page,
@@ -1240,15 +1233,95 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
-
- ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
- delalloc_end, wbc);
- if (ret < 0)
- return ret;
-
+ btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start,
+ min(delalloc_end, page_end) + 1 -
+ delalloc_start);
+ last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
+ delalloc_start = page_start;
+
+ if (!last_delalloc_end)
+ goto out;
+
+ /* Run the delalloc ranges for the above locked ranges. */
+ while (delalloc_start < page_end) {
+ u64 found_start;
+ u32 found_len;
+ bool found;
+
+ if (!is_subpage) {
+ /*
+ * For non-subpage case, the found delalloc range must
+ * cover this page and there must be only one locked
+ * delalloc range.
+ */
+ found_start = page_start;
+ found_len = last_delalloc_end + 1 - found_start;
+ found = true;
+ } else {
+ found = btrfs_subpage_find_writer_locked(fs_info, folio,
+ delalloc_start, &found_start, &found_len);
+ }
+ if (!found)
+ break;
+ /*
+ * The subpage range covers the last sector, the delalloc range may
+ * end beyond the page boundary, use the saved delalloc_end
+ * instead.
+ */
+ if (found_start + found_len >= page_end)
+ found_len = last_delalloc_end + 1 - found_start;
+
+ if (ret >= 0) {
+ /* No errors hit so far, run the current delalloc range. */
+ ret = btrfs_run_delalloc_range(inode, page, found_start,
+ found_start + found_len - 1,
+ wbc);
+ } else {
+ /*
+ * We've hit an error during previous delalloc range,
+ * have to cleanup the remaining locked ranges.
+ */
+ unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
+ __unlock_for_delalloc(&inode->vfs_inode, page, found_start,
+ found_start + found_len - 1);
+ }
+
+ /*
+ * We can hit btrfs_run_delalloc_range() with >0 return value.
+ *
+ * This happens when either the IO is already done and page
+ * unlocked (inline) or the IO submission and page unlock would
+ * be handled as async (compression).
+ *
+ * Inline is only possible for regular sectorsize for now.
+ *
+ * Compression is possible for both subpage and regular cases,
+ * but even for subpage compression only happens for page aligned
+ * range, thus the found delalloc range must go beyond current
+ * page.
+ */
+ if (ret > 0)
+ ASSERT(!is_subpage || found_start + found_len >= page_end);
+
+ /*
+ * Above btrfs_run_delalloc_range() may have unlocked the page,
+ * thus for the last range, we cannot touch the page anymore.
+ */
+ if (found_start + found_len >= last_delalloc_end + 1)
+ break;
+ delalloc_start = found_start + found_len;
+ }
+ if (ret < 0)
+ return ret;
+out:
+ if (last_delalloc_end)
+ delalloc_end = last_delalloc_end;
+ else
+ delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
* we don't subtract one from PAGE_SIZE
@@ -1292,7 +1365,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* Return the next dirty range in [@start, @end).
* If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
*/
-static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
struct folio *folio = page_folio(page);
@@ -1339,20 +1412,23 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* < 0 if there were errors (page still locked)
*/
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
- struct page *page,
+ struct page *page, u64 start, u32 len,
struct btrfs_bio_ctrl *bio_ctrl,
loff_t i_size,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 cur = page_offset(page);
- u64 end = cur + PAGE_SIZE - 1;
+ u64 cur = start;
+ u64 end = start + len - 1;
u64 extent_offset;
u64 block_start;
struct extent_map *em;
int ret = 0;
int nr = 0;
+ ASSERT(start >= page_offset(page) &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
ret = btrfs_writepage_cow_fixup(page);
if (ret) {
/* Fixup worker will requeue */
@@ -1405,8 +1481,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
- block_start = em->block_start;
- disk_bytenr = em->block_start + extent_offset;
+ block_start = extent_map_block_start(em);
+ disk_bytenr = extent_map_block_start(em) + extent_offset;
ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
@@ -1441,7 +1517,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len);
*nr_ret = nr;
return 0;
@@ -1499,7 +1575,8 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl
if (ret)
goto done;
- ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page),
+ PAGE_SIZE, bio_ctrl, i_size, &nr);
if (ret == 1)
return 0;
@@ -1516,7 +1593,8 @@ done:
PAGE_SIZE, !ret);
mapping_set_error(page->mapping, ret);
}
- unlock_page(page);
+
+ btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio);
ASSERT(ret <= 0);
return ret;
}
@@ -1648,7 +1726,7 @@ static void set_btree_ioerr(struct extent_buffer *eb)
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- struct btrfs_fs_info *fs_info, u64 start)
+ const struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
@@ -2217,7 +2295,7 @@ retry:
* already been ran (aka, ordered extent inserted) and all pages are still
* locked.
*/
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty)
{
@@ -2246,20 +2324,21 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
page = find_get_page(mapping, cur >> PAGE_SHIFT);
ASSERT(PageLocked(page));
- if (pages_dirty && page != locked_page) {
+ if (pages_dirty && page != locked_page)
ASSERT(PageDirty(page));
- clear_page_dirty_for_io(page);
- }
- ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
- i_size, &nr);
+ ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len,
+ &bio_ctrl, i_size, &nr);
if (ret == 1)
goto next_page;
/* Make sure the mapping tag for page dirty gets cleared. */
if (nr == 0) {
- set_page_writeback(page);
- end_page_writeback(page);
+ struct folio *folio;
+
+ folio = page_folio(page);
+ btrfs_folio_set_writeback(fs_info, folio, cur, cur_len);
+ btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len);
}
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
@@ -2470,877 +2549,6 @@ next:
return try_release_extent_state(io_tree, page, mask);
}
-struct btrfs_fiemap_entry {
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
-};
-
-/*
- * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
- * range from the inode's io tree, unlock the subvolume tree search path, flush
- * the fiemap cache and relock the file range and research the subvolume tree.
- * The value here is something negative that can't be confused with a valid
- * errno value and different from 1 because that's also a return value from
- * fiemap_fill_next_extent() and also it's often used to mean some btree search
- * did not find a key, so make it some distinct negative value.
- */
-#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
-
-/*
- * Used to:
- *
- * - Cache the next entry to be emitted to the fiemap buffer, so that we can
- * merge extents that are contiguous and can be grouped as a single one;
- *
- * - Store extents ready to be written to the fiemap buffer in an intermediary
- * buffer. This intermediary buffer is to ensure that in case the fiemap
- * buffer is memory mapped to the fiemap target file, we don't deadlock
- * during btrfs_page_mkwrite(). This is because during fiemap we are locking
- * an extent range in order to prevent races with delalloc flushing and
- * ordered extent completion, which is needed in order to reliably detect
- * delalloc in holes and prealloc extents. And this can lead to a deadlock
- * if the fiemap buffer is memory mapped to the file we are running fiemap
- * against (a silly, useless in practice scenario, but possible) because
- * btrfs_page_mkwrite() will try to lock the same extent range.
- */
-struct fiemap_cache {
- /* An array of ready fiemap entries. */
- struct btrfs_fiemap_entry *entries;
- /* Number of entries in the entries array. */
- int entries_size;
- /* Index of the next entry in the entries array to write to. */
- int entries_pos;
- /*
- * Once the entries array is full, this indicates what's the offset for
- * the next file extent item we must search for in the inode's subvolume
- * tree after unlocking the extent range in the inode's io tree and
- * releasing the search path.
- */
- u64 next_search_offset;
- /*
- * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
- * to count ourselves emitted extents and stop instead of relying on
- * fiemap_fill_next_extent() because we buffer ready fiemap entries at
- * the @entries array, and we want to stop as soon as we hit the max
- * amount of extents to map, not just to save time but also to make the
- * logic at extent_fiemap() simpler.
- */
- unsigned int extents_mapped;
- /* Fields for the cached extent (unsubmitted, not ready, extent). */
- u64 offset;
- u64 phys;
- u64 len;
- u32 flags;
- bool cached;
-};
-
-static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- for (int i = 0; i < cache->entries_pos; i++) {
- struct btrfs_fiemap_entry *entry = &cache->entries[i];
- int ret;
-
- ret = fiemap_fill_next_extent(fieinfo, entry->offset,
- entry->phys, entry->len,
- entry->flags);
- /*
- * Ignore 1 (reached max entries) because we keep track of that
- * ourselves in emit_fiemap_extent().
- */
- if (ret < 0)
- return ret;
- }
- cache->entries_pos = 0;
-
- return 0;
-}
-
-/*
- * Helper to submit fiemap extent.
- *
- * Will try to merge current fiemap extent specified by @offset, @phys,
- * @len and @flags with cached one.
- * And only when we fails to merge, cached one will be submitted as
- * fiemap extent.
- *
- * Return value is the same as fiemap_fill_next_extent().
- */
-static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- u64 offset, u64 phys, u64 len, u32 flags)
-{
- struct btrfs_fiemap_entry *entry;
- u64 cache_end;
-
- /* Set at the end of extent_fiemap(). */
- ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
-
- if (!cache->cached)
- goto assign;
-
- /*
- * When iterating the extents of the inode, at extent_fiemap(), we may
- * find an extent that starts at an offset behind the end offset of the
- * previous extent we processed. This happens if fiemap is called
- * without FIEMAP_FLAG_SYNC and there are ordered extents completing
- * after we had to unlock the file range, release the search path, emit
- * the fiemap extents stored in the buffer (cache->entries array) and
- * the lock the remainder of the range and re-search the btree.
- *
- * For example we are in leaf X processing its last item, which is the
- * file extent item for file range [512K, 1M[, and after
- * btrfs_next_leaf() releases the path, there's an ordered extent that
- * completes for the file range [768K, 2M[, and that results in trimming
- * the file extent item so that it now corresponds to the file range
- * [512K, 768K[ and a new file extent item is inserted for the file
- * range [768K, 2M[, which may end up as the last item of leaf X or as
- * the first item of the next leaf - in either case btrfs_next_leaf()
- * will leave us with a path pointing to the new extent item, for the
- * file range [768K, 2M[, since that's the first key that follows the
- * last one we processed. So in order not to report overlapping extents
- * to user space, we trim the length of the previously cached extent and
- * emit it.
- *
- * Upon calling btrfs_next_leaf() we may also find an extent with an
- * offset smaller than or equals to cache->offset, and this happens
- * when we had a hole or prealloc extent with several delalloc ranges in
- * it, but after btrfs_next_leaf() released the path, delalloc was
- * flushed and the resulting ordered extents were completed, so we can
- * now have found a file extent item for an offset that is smaller than
- * or equals to what we have in cache->offset. We deal with this as
- * described below.
- */
- cache_end = cache->offset + cache->len;
- if (cache_end > offset) {
- if (offset == cache->offset) {
- /*
- * We cached a dealloc range (found in the io tree) for
- * a hole or prealloc extent and we have now found a
- * file extent item for the same offset. What we have
- * now is more recent and up to date, so discard what
- * we had in the cache and use what we have just found.
- */
- goto assign;
- } else if (offset > cache->offset) {
- /*
- * The extent range we previously found ends after the
- * offset of the file extent item we found and that
- * offset falls somewhere in the middle of that previous
- * extent range. So adjust the range we previously found
- * to end at the offset of the file extent item we have
- * just found, since this extent is more up to date.
- * Emit that adjusted range and cache the file extent
- * item we have just found. This corresponds to the case
- * where a previously found file extent item was split
- * due to an ordered extent completing.
- */
- cache->len = offset - cache->offset;
- goto emit;
- } else {
- const u64 range_end = offset + len;
-
- /*
- * The offset of the file extent item we have just found
- * is behind the cached offset. This means we were
- * processing a hole or prealloc extent for which we
- * have found delalloc ranges (in the io tree), so what
- * we have in the cache is the last delalloc range we
- * found while the file extent item we found can be
- * either for a whole delalloc range we previously
- * emmitted or only a part of that range.
- *
- * We have two cases here:
- *
- * 1) The file extent item's range ends at or behind the
- * cached extent's end. In this case just ignore the
- * current file extent item because we don't want to
- * overlap with previous ranges that may have been
- * emmitted already;
- *
- * 2) The file extent item starts behind the currently
- * cached extent but its end offset goes beyond the
- * end offset of the cached extent. We don't want to
- * overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
- * extent and then partially store the current file
- * extent item's range in the cache, for the subrange
- * going the cached extent's end to the end of the
- * file extent item.
- */
- if (range_end <= cache_end)
- return 0;
-
- if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
- phys += cache_end - offset;
-
- offset = cache_end;
- len = range_end - cache_end;
- goto emit;
- }
- }
-
- /*
- * Only merges fiemap extents if
- * 1) Their logical addresses are continuous
- *
- * 2) Their physical addresses are continuous
- * So truly compressed (physical size smaller than logical size)
- * extents won't get merged with each other
- *
- * 3) Share same flags
- */
- if (cache->offset + cache->len == offset &&
- cache->phys + cache->len == phys &&
- cache->flags == flags) {
- cache->len += len;
- return 0;
- }
-
-emit:
- /* Not mergeable, need to submit cached one */
-
- if (cache->entries_pos == cache->entries_size) {
- /*
- * We will need to research for the end offset of the last
- * stored extent and not from the current offset, because after
- * unlocking the range and releasing the path, if there's a hole
- * between that end offset and this current offset, a new extent
- * may have been inserted due to a new write, so we don't want
- * to miss it.
- */
- entry = &cache->entries[cache->entries_size - 1];
- cache->next_search_offset = entry->offset + entry->len;
- cache->cached = false;
-
- return BTRFS_FIEMAP_FLUSH_CACHE;
- }
-
- entry = &cache->entries[cache->entries_pos];
- entry->offset = cache->offset;
- entry->phys = cache->phys;
- entry->len = cache->len;
- entry->flags = cache->flags;
- cache->entries_pos++;
- cache->extents_mapped++;
-
- if (cache->extents_mapped == fieinfo->fi_extents_max) {
- cache->cached = false;
- return 1;
- }
-assign:
- cache->cached = true;
- cache->offset = offset;
- cache->phys = phys;
- cache->len = len;
- cache->flags = flags;
-
- return 0;
-}
-
-/*
- * Emit last fiemap cache
- *
- * The last fiemap cache may still be cached in the following case:
- * 0 4k 8k
- * |<- Fiemap range ->|
- * |<------------ First extent ----------->|
- *
- * In this case, the first extent range will be cached but not emitted.
- * So we must emit it before ending extent_fiemap().
- */
-static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache)
-{
- int ret;
-
- if (!cache->cached)
- return 0;
-
- ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
- cache->len, cache->flags);
- cache->cached = false;
- if (ret > 0)
- ret = 0;
- return ret;
-}
-
-static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
-{
- struct extent_buffer *clone = path->nodes[0];
- struct btrfs_key key;
- int slot;
- int ret;
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
- return 0;
-
- /*
- * Add a temporary extra ref to an already cloned extent buffer to
- * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
- * the cost of allocating a new one.
- */
- ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
-
- ret = btrfs_next_leaf(inode->root, path);
- if (ret != 0)
- goto out;
-
- /*
- * Don't bother with cloning if there are no more file extent items for
- * our inode.
- */
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
- ret = 1;
- goto out;
- }
-
- /*
- * Important to preserve the start field, for the optimizations when
- * checking if extents are shared (see extent_fiemap()).
- *
- * We must set ->start before calling copy_extent_buffer_full(). If we
- * are on sub-pagesize blocksize, we use ->start to determine the offset
- * into the folio where our eb exists, and if we update ->start after
- * the fact then any subsequent reads of the eb may read from a
- * different offset in the folio than where we originally copied into.
- */
- clone->start = path->nodes[0]->start;
- /* See the comment at fiemap_search_slot() about why we clone. */
- copy_extent_buffer_full(clone, path->nodes[0]);
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-out:
- if (ret)
- free_extent_buffer(clone);
-
- return ret;
-}
-
-/*
- * Search for the first file extent item that starts at a given file offset or
- * the one that starts immediately before that offset.
- * Returns: 0 on success, < 0 on error, 1 if not found.
- */
-static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
- u64 file_offset)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *clone;
- struct btrfs_key key;
- int slot;
- int ret;
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = file_offset;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- if (ret > 0 && path->slots[0] > 0) {
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path->slots[0]--;
- }
-
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret != 0)
- return ret;
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- return 1;
- }
-
- /*
- * We clone the leaf and use it during fiemap. This is because while
- * using the leaf we do expensive things like checking if an extent is
- * shared, which can take a long time. In order to prevent blocking
- * other tasks for too long, we use a clone of the leaf. We have locked
- * the file range in the inode's io tree, so we know none of our file
- * extent items can change. This way we avoid blocking other tasks that
- * want to insert items for other inodes in the same leaf or b+tree
- * rebalance operations (triggered for example when someone is trying
- * to push items into this leaf when trying to insert an item in a
- * neighbour leaf).
- * We also need the private clone because holding a read lock on an
- * extent buffer of the subvolume's b+tree will make lockdep unhappy
- * when we check if extents are shared, as backref walking may need to
- * lock the same leaf we are processing.
- */
- clone = btrfs_clone_extent_buffer(path->nodes[0]);
- if (!clone)
- return -ENOMEM;
-
- slot = path->slots[0];
- btrfs_release_path(path);
- path->nodes[0] = clone;
- path->slots[0] = slot;
-
- return 0;
-}
-
-/*
- * Process a range which is a hole or a prealloc extent in the inode's subvolume
- * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
- * extent. The end offset (@end) is inclusive.
- */
-static int fiemap_process_hole(struct btrfs_inode *inode,
- struct fiemap_extent_info *fieinfo,
- struct fiemap_cache *cache,
- struct extent_state **delalloc_cached_state,
- struct btrfs_backref_share_check_ctx *backref_ctx,
- u64 disk_bytenr, u64 extent_offset,
- u64 extent_gen,
- u64 start, u64 end)
-{
- const u64 i_size = i_size_read(&inode->vfs_inode);
- u64 cur_offset = start;
- u64 last_delalloc_end = 0;
- u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
- bool checked_extent_shared = false;
- int ret;
-
- /*
- * There can be no delalloc past i_size, so don't waste time looking for
- * it beyond i_size.
- */
- while (cur_offset < end && cur_offset < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- u64 prealloc_start;
- u64 prealloc_len = 0;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
- delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
- break;
-
- /*
- * If this is a prealloc extent we have to report every section
- * of it that has no delalloc.
- */
- if (disk_bytenr != 0) {
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = delalloc_start - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = delalloc_start - prealloc_start;
- }
- }
-
- if (prealloc_len > 0) {
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
-
- checked_extent_shared = true;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- extent_offset += prealloc_len;
- }
-
- ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
- delalloc_end + 1 - delalloc_start,
- FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- if (ret)
- return ret;
-
- last_delalloc_end = delalloc_end;
- cur_offset = delalloc_end + 1;
- extent_offset += cur_offset - delalloc_start;
- cond_resched();
- }
-
- /*
- * Either we found no delalloc for the whole prealloc extent or we have
- * a prealloc extent that spans i_size or starts at or after i_size.
- */
- if (disk_bytenr != 0 && last_delalloc_end < end) {
- u64 prealloc_start;
- u64 prealloc_len;
-
- if (last_delalloc_end == 0) {
- prealloc_start = start;
- prealloc_len = end + 1 - start;
- } else {
- prealloc_start = last_delalloc_end + 1;
- prealloc_len = end + 1 - prealloc_start;
- }
-
- if (!checked_extent_shared && fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- prealloc_flags |= FIEMAP_EXTENT_SHARED;
- }
- ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
- disk_bytenr + extent_offset,
- prealloc_len, prealloc_flags);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
- struct btrfs_path *path,
- u64 *last_extent_end_ret)
-{
- const u64 ino = btrfs_ino(inode);
- struct btrfs_root *root = inode->root;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 disk_bytenr;
- int ret;
-
- /*
- * Lookup the last file extent. We're not using i_size here because
- * there might be preallocation past i_size.
- */
- ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
- /* There can't be a file extent item at offset (u64)-1 */
- ASSERT(ret != 0);
- if (ret < 0)
- return ret;
-
- /*
- * For a non-existing key, btrfs_search_slot() always leaves us at a
- * slot > 0, except if the btree is empty, which is impossible because
- * at least it has the inode item for this inode and all the items for
- * the root inode 256.
- */
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* No file extent items in the subvolume tree. */
- *last_extent_end_ret = 0;
- return 0;
- }
-
- /*
- * For an inline extent, the disk_bytenr is where inline data starts at,
- * so first check if we have an inline extent item before checking if we
- * have an implicit hole (disk_bytenr == 0).
- */
- ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
- }
-
- /*
- * Find the last file extent item that is not a hole (when NO_HOLES is
- * not enabled). This should take at most 2 iterations in the worst
- * case: we have one hole file extent item at slot 0 of a leaf and
- * another hole file extent item as the last item in the previous leaf.
- * This is because we merge file extent items that represent holes.
- */
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- while (disk_bytenr == 0) {
- ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- /* No file extent items that are not holes. */
- *last_extent_end_ret = 0;
- return 0;
- }
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- }
-
- *last_extent_end_ret = btrfs_file_extent_end(path);
- return 0;
-}
-
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
- struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
- struct fiemap_cache cache = { 0 };
- struct btrfs_backref_share_check_ctx *backref_ctx;
- u64 last_extent_end;
- u64 prev_extent_end;
- u64 range_start;
- u64 range_end;
- const u64 sectorsize = inode->root->fs_info->sectorsize;
- bool stopped = false;
- int ret;
-
- cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
- cache.entries = kmalloc_array(cache.entries_size,
- sizeof(struct btrfs_fiemap_entry),
- GFP_KERNEL);
- backref_ctx = btrfs_alloc_backref_share_check_ctx();
- path = btrfs_alloc_path();
- if (!cache.entries || !backref_ctx || !path) {
- ret = -ENOMEM;
- goto out;
- }
-
-restart:
- range_start = round_down(start, sectorsize);
- range_end = round_up(start + len, sectorsize);
- prev_extent_end = range_start;
-
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
- if (ret < 0)
- goto out_unlock;
- btrfs_release_path(path);
-
- path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, range_start);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /*
- * No file extent item found, but we may have delalloc between
- * the current offset and i_size. So check for that.
- */
- ret = 0;
- goto check_eof_delalloc;
- }
-
- while (prev_extent_end < range_end) {
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_file_extent_item *ei;
- struct btrfs_key key;
- u64 extent_end;
- u64 extent_len;
- u64 extent_offset = 0;
- u64 extent_gen;
- u64 disk_bytenr = 0;
- u64 flags = 0;
- int extent_type;
- u8 compression;
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
- break;
-
- extent_end = btrfs_file_extent_end(path);
-
- /*
- * The first iteration can leave us at an extent item that ends
- * before our range's start. Move to the next item.
- */
- if (extent_end <= range_start)
- goto next_item;
-
- backref_ctx->curr_leaf_bytenr = leaf->start;
-
- /* We have in implicit hole (NO_HOLES feature enabled). */
- if (prev_extent_end < key.offset) {
- const u64 hole_end = min(key.offset, range_end) - 1;
-
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- prev_extent_end, hole_end);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* fiemap_fill_next_extent() told us to stop. */
- stopped = true;
- break;
- }
-
- /* We've reached the end of the fiemap range, stop. */
- if (key.offset >= range_end) {
- stopped = true;
- break;
- }
- }
-
- extent_len = extent_end - key.offset;
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- compression = btrfs_file_extent_compression(leaf, ei);
- extent_type = btrfs_file_extent_type(leaf, ei);
- extent_gen = btrfs_file_extent_generation(leaf, ei);
-
- if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
- if (compression == BTRFS_COMPRESS_NONE)
- extent_offset = btrfs_file_extent_offset(leaf, ei);
- }
-
- if (compression != BTRFS_COMPRESS_NONE)
- flags |= FIEMAP_EXTENT_ENCODED;
-
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
- flags |= FIEMAP_EXTENT_DATA_INLINE;
- flags |= FIEMAP_EXTENT_NOT_ALIGNED;
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
- extent_len, flags);
- } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx,
- disk_bytenr, extent_offset,
- extent_gen, key.offset,
- extent_end - 1);
- } else if (disk_bytenr == 0) {
- /* We have an explicit hole. */
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state,
- backref_ctx, 0, 0, 0,
- key.offset, extent_end - 1);
- } else {
- /* We have a regular extent. */
- if (fieinfo->fi_extents_max) {
- ret = btrfs_is_data_extent_shared(inode,
- disk_bytenr,
- extent_gen,
- backref_ctx);
- if (ret < 0)
- goto out_unlock;
- else if (ret > 0)
- flags |= FIEMAP_EXTENT_SHARED;
- }
-
- ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
- disk_bytenr + extent_offset,
- extent_len, flags);
- }
-
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* emit_fiemap_extent() told us to stop. */
- stopped = true;
- break;
- }
-
- prev_extent_end = extent_end;
-next_item:
- if (fatal_signal_pending(current)) {
- ret = -EINTR;
- goto out_unlock;
- }
-
- ret = fiemap_next_leaf_item(inode, path);
- if (ret < 0) {
- goto out_unlock;
- } else if (ret > 0) {
- /* No more file extent items for this inode. */
- break;
- }
- cond_resched();
- }
-
-check_eof_delalloc:
- if (!stopped && prev_extent_end < range_end) {
- ret = fiemap_process_hole(inode, fieinfo, &cache,
- &delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, range_end - 1);
- if (ret < 0)
- goto out_unlock;
- prev_extent_end = range_end;
- }
-
- if (cache.cached && cache.offset + cache.len >= last_extent_end) {
- const u64 i_size = i_size_read(&inode->vfs_inode);
-
- if (prev_extent_end < i_size) {
- u64 delalloc_start;
- u64 delalloc_end;
- bool delalloc;
-
- delalloc = btrfs_find_delalloc_in_range(inode,
- prev_extent_end,
- i_size - 1,
- &delalloc_cached_state,
- &delalloc_start,
- &delalloc_end);
- if (!delalloc)
- cache.flags |= FIEMAP_EXTENT_LAST;
- } else {
- cache.flags |= FIEMAP_EXTENT_LAST;
- }
- }
-
-out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
-
- if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
- btrfs_release_path(path);
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
- len -= cache.next_search_offset - start;
- start = cache.next_search_offset;
- goto restart;
- } else if (ret < 0) {
- goto out;
- }
-
- /*
- * Must free the path before emitting to the fiemap buffer because we
- * may have a non-cloned leaf and if the fiemap buffer is memory mapped
- * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
- * waiting for an ordered extent that in order to complete needs to
- * modify that leaf, therefore leading to a deadlock.
- */
- btrfs_free_path(path);
- path = NULL;
-
- ret = flush_fiemap_cache(fieinfo, &cache);
- if (ret)
- goto out;
-
- ret = emit_last_fiemap_cache(fieinfo, &cache);
-out:
- free_extent_state(delalloc_cached_state);
- kfree(cache.entries);
- btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
- return ret;
-}
-
static void __free_extent_buffer(struct extent_buffer *eb)
{
kmem_cache_free(extent_buffer_cache, eb);
@@ -3372,7 +2580,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
return false;
}
-static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
+static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -3433,7 +2641,7 @@ static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *f
}
/* Release all pages attached to the extent buffer */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static void btrfs_release_extent_buffer_pages(const struct extent_buffer *eb)
{
ASSERT(!extent_buffer_under_io(eb));
@@ -3499,7 +2707,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = alloc_eb_folio_array(new, 0);
+ ret = alloc_eb_folio_array(new, false);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
@@ -3507,7 +2715,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
for (int i = 0; i < num_folios; i++) {
struct folio *folio = new->folios[i];
- int ret;
ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
@@ -3533,7 +2740,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return NULL;
- ret = alloc_eb_folio_array(eb, 0);
+ ret = alloc_eb_folio_array(eb, false);
if (ret)
goto err;
@@ -3553,7 +2760,7 @@ err:
for (int i = 0; i < num_folios; i++) {
if (eb->folios[i]) {
detach_extent_buffer_folio(eb, eb->folios[i]);
- __folio_put(eb->folios[i]);
+ folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3899,7 +3106,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
reallocate:
/* Allocate all pages first. */
- ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
btrfs_free_subpage(prealloc);
goto out;
@@ -4351,7 +3558,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *check)
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
bool ret;
@@ -4589,8 +3796,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
return;
if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
-
+ folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
@@ -4608,7 +3814,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
size_t cur;
size_t offset;
char *kaddr;
- char *src = (char *)srcv;
+ const char *src = (const char *)srcv;
unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -4965,7 +4171,7 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
- struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+ const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index dca6b12769ec..dceebd76c7d1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -215,6 +215,11 @@ static inline struct extent_changeset *extent_changeset_alloc(void)
return ret;
}
+static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask)
+{
+ ulist_prealloc(&changeset->range_changed, gfp_mask);
+}
+
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
if (!changeset)
@@ -235,15 +240,13 @@ bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int btrfs_read_folio(struct file *file, struct folio *folio);
-void extent_write_locked_range(struct inode *inode, struct page *locked_page,
+void extent_write_locked_range(struct inode *inode, const struct page *locked_page,
u64 start, u64 end, struct writeback_control *wbc,
bool pages_dirty);
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void btrfs_readahead(struct readahead_control *rac);
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len);
int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);
@@ -263,7 +266,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
- struct btrfs_tree_parent_check *parent_check);
+ const struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@@ -350,9 +353,8 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
- struct page *locked_page,
+ const struct page *locked_page,
struct extent_state **cached,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
@@ -361,9 +363,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
- gfp_t extra_gfp);
-int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
- gfp_t extra_gfp);
+ bool nofail);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 744e8952abb0..81558f90ee80 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -33,7 +33,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT_CACHED;
+ tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -85,27 +85,24 @@ static void dec_evictable_extent_maps(struct btrfs_inode *inode)
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
-static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
+static int tree_insert(struct rb_root *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_root.rb_node;
+ struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
- bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start) {
+ if (em->start < entry->start)
p = &(*p)->rb_left;
- } else if (em->start >= extent_map_end(entry)) {
+ else if (em->start >= extent_map_end(entry))
p = &(*p)->rb_right;
- leftmost = false;
- } else {
+ else
return -EEXIST;
- }
}
orig_parent = parent;
@@ -128,7 +125,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color_cached(&em->rb_node, root, leftmost);
+ rb_insert_color(&em->rb_node, root);
return 0;
}
@@ -186,11 +183,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
+static inline u64 extent_map_block_len(const struct extent_map *em)
+{
+ if (extent_map_is_compressed(em))
+ return em->disk_num_bytes;
+ return em->len;
+}
+
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (em->block_start + em->block_len < em->block_start)
+ if (extent_map_block_start(em) + extent_map_block_len(em) <
+ extent_map_block_start(em))
return (u64)-1;
- return em->block_start + em->block_len;
+ return extent_map_block_start(em) + extent_map_block_len(em);
}
static bool can_merge_extent_map(const struct extent_map *em)
@@ -225,15 +230,106 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (prev->flags != next->flags)
return false;
- if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
- return next->block_start == extent_map_block_end(prev);
+ if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
+ return extent_map_block_start(next) == extent_map_block_end(prev);
/* HOLES and INLINE extents. */
- return next->block_start == prev->block_start;
+ return next->disk_bytenr == prev->disk_bytenr;
+}
+
+/*
+ * Handle the on-disk data extents merge for @prev and @next.
+ *
+ * Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
+ * For now only uncompressed regular extent can be merged.
+ *
+ * @prev and @next will be both updated to point to the new merged range.
+ * Thus one of them should be removed by the caller.
+ */
+static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
+{
+ u64 new_disk_bytenr;
+ u64 new_disk_num_bytes;
+ u64 new_offset;
+
+ /* @prev and @next should not be compressed. */
+ ASSERT(!extent_map_is_compressed(prev));
+ ASSERT(!extent_map_is_compressed(next));
+
+ /*
+ * There are two different cases where @prev and @next can be merged.
+ *
+ * 1) They are referring to the same data extent:
+ *
+ * |<----- data extent A ----->|
+ * |<- prev ->|<- next ->|
+ *
+ * 2) They are referring to different data extents but still adjacent:
+ *
+ * |<-- data extent A -->|<-- data extent B -->|
+ * |<- prev ->|<- next ->|
+ *
+ * The calculation here always merges the data extents first, then updates
+ * @offset using the new data extents.
+ *
+ * For case 1), the merged data extent would be the same.
+ * For case 2), we just merge the two data extents into one.
+ */
+ new_disk_bytenr = min(prev->disk_bytenr, next->disk_bytenr);
+ new_disk_num_bytes = max(prev->disk_bytenr + prev->disk_num_bytes,
+ next->disk_bytenr + next->disk_num_bytes) -
+ new_disk_bytenr;
+ new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
+
+ prev->disk_bytenr = new_disk_bytenr;
+ prev->disk_num_bytes = new_disk_num_bytes;
+ prev->ram_bytes = new_disk_num_bytes;
+ prev->offset = new_offset;
+
+ next->disk_bytenr = new_disk_bytenr;
+ next->disk_num_bytes = new_disk_num_bytes;
+ next->ram_bytes = new_disk_num_bytes;
+ next->offset = new_offset;
+}
+
+static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
+ struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ btrfs_crit(fs_info,
+"%s, start=%llu len=%llu disk_bytenr=%llu disk_num_bytes=%llu ram_bytes=%llu offset=%llu flags=0x%x",
+ prefix, em->start, em->len, em->disk_bytenr, em->disk_num_bytes,
+ em->ram_bytes, em->offset, em->flags);
+ ASSERT(0);
+}
+
+/* Internal sanity checks for btrfs debug builds. */
+static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_DEBUG))
+ return;
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_num_bytes == 0)
+ dump_extent_map(fs_info, "zero disk_num_bytes", em);
+ if (em->offset + em->len > em->ram_bytes)
+ dump_extent_map(fs_info, "ram_bytes too small", em);
+ if (em->offset + em->len > em->disk_num_bytes &&
+ !extent_map_is_compressed(em))
+ dump_extent_map(fs_info, "disk_num_bytes too small", em);
+ if (!extent_map_is_compressed(em) &&
+ em->ram_bytes != em->disk_num_bytes)
+ dump_extent_map(fs_info,
+ "ram_bytes mismatch with disk_num_bytes for non-compressed em",
+ em);
+ } else if (em->offset) {
+ dump_extent_map(fs_info, "non-zero offset for hole/inline", em);
+ }
}
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -258,14 +354,15 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
- em->orig_start = merge->orig_start;
em->len += merge->len;
- em->block_len += merge->block_len;
- em->block_start = merge->block_start;
em->generation = max(em->generation, merge->generation);
+
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(merge, em);
em->flags |= EXTENT_FLAG_MERGED;
- rb_erase_cached(&merge->rb_node, &tree->map);
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
dec_evictable_extent_maps(inode);
@@ -277,8 +374,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
- em->block_len += merge->block_len;
- rb_erase_cached(&merge->rb_node, &tree->map);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ merge_ondisk_extents(em, merge);
+ validate_extent_map(fs_info, em);
+ rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
@@ -389,7 +488,8 @@ static int add_extent_mapping(struct btrfs_inode *inode,
lockdep_assert_held_write(&tree->lock);
- ret = tree_insert(&tree->map, em);
+ validate_extent_map(fs_info, em);
+ ret = tree_insert(&tree->root, em);
if (ret)
return ret;
@@ -410,7 +510,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
+ rb_node = __tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -479,7 +579,7 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
- rb_erase_cached(&em->rb_node, &tree->map);
+ rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
@@ -492,15 +592,18 @@ static void replace_extent_mapping(struct btrfs_inode *inode,
struct extent_map *new,
int modified)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
lockdep_assert_held_write(&tree->lock);
+ validate_extent_map(fs_info, new);
+
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
- rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(inode, new, modified);
@@ -561,11 +664,8 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
start_diff = start - em->start;
em->start = start;
em->len = end - start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !extent_map_is_compressed(em)) {
- em->block_start += start_diff;
- em->block_len = em->len;
- }
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE && !extent_map_is_compressed(em))
+ em->offset += start_diff;
return add_extent_mapping(inode, em, 0);
}
@@ -600,7 +700,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* Tree-checker should have rejected any inline extent with non-zero
* file offset. Here just do a sanity check.
*/
- if (em->block_start == EXTENT_MAP_INLINE)
+ if (em->disk_bytenr == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
ret = add_extent_mapping(inode, em, 0);
@@ -657,18 +757,23 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
struct extent_map_tree *tree = &inode->extent_tree;
+ struct rb_node *node;
write_lock(&tree->lock);
- while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+ node = rb_first(&tree->root);
+ while (node) {
struct extent_map *em;
- struct rb_node *node;
+ struct rb_node *next = rb_next(node);
- node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
remove_extent_mapping(inode, em);
free_extent_map(em);
- cond_resched_rwlock_write(&tree->lock);
+
+ if (cond_resched_rwlock_write(&tree->lock))
+ node = rb_first(&tree->root);
+ else
+ node = next;
}
write_unlock(&tree->lock);
}
@@ -729,7 +834,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
u64 gen;
unsigned long flags;
bool modified;
- bool compressed;
if (em_end < end) {
next_em = next_extent_map(em);
@@ -763,7 +867,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -775,22 +878,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->start = em->start;
split->len = start - em->start;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_start = em->orig_start;
- split->block_start = em->block_start;
-
- if (compressed)
- split->block_len = em->block_len;
- else
- split->block_len = split->len;
- split->orig_block_len = max(split->block_len,
- em->orig_block_len);
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset;
split->ram_bytes = em->ram_bytes;
} else {
- split->orig_start = split->start;
- split->block_len = 0;
- split->block_start = em->block_start;
- split->orig_block_len = 0;
+ split->disk_bytenr = em->disk_bytenr;
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
}
@@ -810,30 +906,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
split->start = end;
split->len = em_end - end;
- split->block_start = em->block_start;
+ split->disk_bytenr = em->disk_bytenr;
split->flags = flags;
split->generation = gen;
- if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- split->orig_block_len = max(em->block_len,
- em->orig_block_len);
-
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ split->disk_num_bytes = em->disk_num_bytes;
+ split->offset = em->offset + end - em->start;
split->ram_bytes = em->ram_bytes;
- if (compressed) {
- split->block_len = em->block_len;
- split->orig_start = em->orig_start;
- } else {
- const u64 diff = end - em->start;
-
- split->block_len = split->len;
- split->block_start += diff;
- split->orig_start = em->orig_start;
- }
} else {
+ split->disk_num_bytes = 0;
+ split->offset = 0;
split->ram_bytes = split->len;
- split->orig_start = split->start;
- split->block_len = 0;
- split->orig_block_len = 0;
}
if (extent_map_in_tree(em)) {
@@ -976,7 +1060,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
ASSERT(em->len == len);
ASSERT(!extent_map_is_compressed(em));
- ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+ ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
ASSERT(em->flags & EXTENT_FLAG_PINNED);
ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
@@ -987,10 +1071,9 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
split_pre->len = pre;
- split_pre->orig_start = split_pre->start;
- split_pre->block_start = new_logical;
- split_pre->block_len = split_pre->len;
- split_pre->orig_block_len = split_pre->block_len;
+ split_pre->disk_bytenr = new_logical;
+ split_pre->disk_num_bytes = split_pre->len;
+ split_pre->offset = 0;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
split_pre->generation = em->generation;
@@ -1005,10 +1088,9 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->orig_start = split_mid->start;
- split_mid->block_start = em->block_start + pre;
- split_mid->block_len = split_mid->len;
- split_mid->orig_block_len = split_mid->block_len;
+ split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_num_bytes = split_mid->len;
+ split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
split_mid->generation = em->generation;
@@ -1028,7 +1110,14 @@ out_free_pre:
return ret;
}
-static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
+struct btrfs_em_shrink_ctx {
+ long nr_to_scan;
+ long scanned;
+ u64 last_ino;
+ u64 last_root;
+};
+
+static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@@ -1057,14 +1146,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- write_lock(&tree->lock);
- node = rb_first_cached(&tree->map);
+ /*
+ * We want to be fast because we can be called from any path trying to
+ * allocate memory, so if the lock is busy we don't want to spend time
+ * waiting for it - either some task is about to do IO for the inode or
+ * we may have another task shrinking extent maps, here in this code, so
+ * skip this inode.
+ */
+ if (!write_trylock(&tree->lock)) {
+ up_read(&inode->i_mmap_lock);
+ return 0;
+ }
+
+ node = rb_first(&tree->root);
while (node) {
+ struct rb_node *next = rb_next(node);
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
- node = rb_next(node);
- (*scanned)++;
+ ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@@ -1085,16 +1185,19 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
- * Restart if we had to reschedule, and any extent maps that were
- * pinned before may have become unpinned after we released the
- * lock and took it again.
+ * Stop if we need to reschedule or there's contention on the
+ * lock. This is to avoid slowing other tasks trying to take the
+ * lock and because the shrinker might be called during a memory
+ * allocation path and we want to avoid taking a very long time
+ * and slowing down all sorts of tasks.
*/
- if (cond_resched_rwlock_write(&tree->lock))
- node = rb_first_cached(&tree->map);
+ if (need_resched() || rwlock_needbreak(&tree->lock))
+ break;
+ node = next;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
@@ -1102,25 +1205,30 @@ next:
return nr_dropped;
}
-static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
+static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
+ u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
- iput(&inode->vfs_inode);
+ ctx->last_ino = btrfs_ino(inode);
+ btrfs_add_delayed_iput(inode);
- if (*scanned >= nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan)
+ break;
+
+ /*
+ * We may be called from memory allocation paths, so we don't
+ * want to take too much time and slowdown tasks.
+ */
+ if (need_resched())
break;
- cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
@@ -1132,14 +1240,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
+ ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- fs_info->extent_map_shrinker_last_ino = 0;
- fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
+ ctx->last_ino = 0;
+ ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@@ -1147,19 +1255,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
- const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
- u64 next_root_id = start_root_id;
+ struct btrfs_em_shrink_ctx ctx;
+ u64 start_root_id;
+ u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- long scanned = 0;
+
+ ctx.scanned = 0;
+ ctx.nr_to_scan = nr_to_scan;
+
+ /*
+ * In case we have multiple tasks running this shrinker, make the next
+ * one start from the next inode in case it starts before we finish.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
+ fs_info->extent_map_shrinker_last_ino++;
+ ctx.last_root = fs_info->extent_map_shrinker_last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
+ start_root_id = ctx.last_root;
+ next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
- while (scanned < nr_to_scan) {
+ /*
+ * We may be called from memory allocation paths, so we don't want to
+ * take too much time and slowdown tasks, so stop if we need reschedule.
+ */
+ while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@@ -1171,8 +1301,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- fs_info->extent_map_shrinker_last_root = 0;
- fs_info->extent_map_shrinker_last_ino = 0;
+ ctx.last_root = 0;
+ ctx.last_ino = 0;
cycled = true;
continue;
}
@@ -1186,15 +1316,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
- nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
+ nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
+ /*
+ * In case of multiple tasks running this extent map shrinking code this
+ * isn't perfect but it's simple and silences things like KCSAN. It's
+ * not possible to know which task made more progress because we can
+ * cycle back to the first root and first inode if it's not the first
+ * time the shrinker ran, see the above logic. Also a task that started
+ * later may finish ealier than another task and made less progress. So
+ * make this simple and update to the progress of the last task that
+ * finished, with the occasional possiblity of having two consecutive
+ * runs of the shrinker process the same inodes.
+ */
+ spin_lock(&fs_info->extent_map_shrinker_lock);
+ fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
+ fs_info->extent_map_shrinker_last_root = ctx.last_root;
+ spin_unlock(&fs_info->extent_map_shrinker_lock);
+
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
+ nr, ctx.last_root,
+ ctx.last_ino);
}
return nr_dropped;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 6d587111f73a..5154a8f1d26c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -4,12 +4,11 @@
#define BTRFS_EXTENT_MAP_H
#include <linux/compiler_types.h>
-#include <linux/rwlock_types.h>
+#include <linux/spinlock_types.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include "misc.h"
-#include "extent_map.h"
#include "compression.h"
struct btrfs_inode;
@@ -62,46 +61,33 @@ struct extent_map {
u64 len;
/*
- * The file offset of the original file extent before splitting.
+ * The bytenr of the full on-disk extent.
*
- * This is an in-memory only member, matching
- * extent_map::start - btrfs_file_extent_item::offset for
- * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
+ * For regular extents it's btrfs_file_extent_item::disk_bytenr.
+ * For holes it's EXTENT_MAP_HOLE and for inline extents it's
+ * EXTENT_MAP_INLINE.
*/
- u64 orig_start;
+ u64 disk_bytenr;
/*
* The full on-disk extent length, matching
* btrfs_file_extent_item::disk_num_bytes.
*/
- u64 orig_block_len;
-
- /*
- * The decompressed size of the whole on-disk extent, matching
- * btrfs_file_extent_item::ram_bytes.
- */
- u64 ram_bytes;
+ u64 disk_num_bytes;
/*
- * The on-disk logical bytenr for the file extent.
- *
- * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
- * For uncompressed extents it matches
- * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
+ * Offset inside the decompressed extent.
*
- * For holes it is EXTENT_MAP_HOLE and for inline extents it is
- * EXTENT_MAP_INLINE.
+ * For regular extents it's btrfs_file_extent_item::offset.
+ * For holes and inline extents it's 0.
*/
- u64 block_start;
+ u64 offset;
/*
- * The on-disk length for the file extent.
- *
- * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
- * For uncompressed extents it matches extent_map::len.
- * For holes and inline extents it's -1 and shouldn't be used.
+ * The decompressed size of the whole on-disk extent, matching
+ * btrfs_file_extent_item::ram_bytes.
*/
- u64 block_len;
+ u64 ram_bytes;
/*
* Generation of the extent map, for merged em it's the highest
@@ -115,7 +101,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root_cached map;
+ struct rb_root root;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -163,6 +149,16 @@ static inline int extent_map_in_tree(const struct extent_map *em)
return !RB_EMPTY_NODE(&em->rb_node);
}
+static inline u64 extent_map_block_start(const struct extent_map *em)
+{
+ if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ if (extent_map_is_compressed(em))
+ return em->disk_bytenr;
+ return em->disk_bytenr + em->offset;
+ }
+ return em->disk_bytenr;
+}
+
static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
new file mode 100644
index 000000000000..8f95f3e44e99
--- /dev/null
+++ b/fs/btrfs/fiemap.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "backref.h"
+#include "btrfs_inode.h"
+#include "fiemap.h"
+#include "file.h"
+#include "file-item.h"
+
+struct btrfs_fiemap_entry {
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+};
+
+/*
+ * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
+ * range from the inode's io tree, unlock the subvolume tree search path, flush
+ * the fiemap cache and relock the file range and research the subvolume tree.
+ * The value here is something negative that can't be confused with a valid
+ * errno value and different from 1 because that's also a return value from
+ * fiemap_fill_next_extent() and also it's often used to mean some btree search
+ * did not find a key, so make it some distinct negative value.
+ */
+#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
+
+/*
+ * Used to:
+ *
+ * - Cache the next entry to be emitted to the fiemap buffer, so that we can
+ * merge extents that are contiguous and can be grouped as a single one;
+ *
+ * - Store extents ready to be written to the fiemap buffer in an intermediary
+ * buffer. This intermediary buffer is to ensure that in case the fiemap
+ * buffer is memory mapped to the fiemap target file, we don't deadlock
+ * during btrfs_page_mkwrite(). This is because during fiemap we are locking
+ * an extent range in order to prevent races with delalloc flushing and
+ * ordered extent completion, which is needed in order to reliably detect
+ * delalloc in holes and prealloc extents. And this can lead to a deadlock
+ * if the fiemap buffer is memory mapped to the file we are running fiemap
+ * against (a silly, useless in practice scenario, but possible) because
+ * btrfs_page_mkwrite() will try to lock the same extent range.
+ */
+struct fiemap_cache {
+ /* An array of ready fiemap entries. */
+ struct btrfs_fiemap_entry *entries;
+ /* Number of entries in the entries array. */
+ int entries_size;
+ /* Index of the next entry in the entries array to write to. */
+ int entries_pos;
+ /*
+ * Once the entries array is full, this indicates what's the offset for
+ * the next file extent item we must search for in the inode's subvolume
+ * tree after unlocking the extent range in the inode's io tree and
+ * releasing the search path.
+ */
+ u64 next_search_offset;
+ /*
+ * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
+ * to count ourselves emitted extents and stop instead of relying on
+ * fiemap_fill_next_extent() because we buffer ready fiemap entries at
+ * the @entries array, and we want to stop as soon as we hit the max
+ * amount of extents to map, not just to save time but also to make the
+ * logic at extent_fiemap() simpler.
+ */
+ unsigned int extents_mapped;
+ /* Fields for the cached extent (unsubmitted, not ready, extent). */
+ u64 offset;
+ u64 phys;
+ u64 len;
+ u32 flags;
+ bool cached;
+};
+
+static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ for (int i = 0; i < cache->entries_pos; i++) {
+ struct btrfs_fiemap_entry *entry = &cache->entries[i];
+ int ret;
+
+ ret = fiemap_fill_next_extent(fieinfo, entry->offset,
+ entry->phys, entry->len,
+ entry->flags);
+ /*
+ * Ignore 1 (reached max entries) because we keep track of that
+ * ourselves in emit_fiemap_extent().
+ */
+ if (ret < 0)
+ return ret;
+ }
+ cache->entries_pos = 0;
+
+ return 0;
+}
+
+/*
+ * Helper to submit fiemap extent.
+ *
+ * Will try to merge current fiemap extent specified by @offset, @phys,
+ * @len and @flags with cached one.
+ * And only when we fails to merge, cached one will be submitted as
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ u64 offset, u64 phys, u64 len, u32 flags)
+{
+ struct btrfs_fiemap_entry *entry;
+ u64 cache_end;
+
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
+ if (!cache->cached)
+ goto assign;
+
+ /*
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * after we had to unlock the file range, release the search path, emit
+ * the fiemap extents stored in the buffer (cache->entries array) and
+ * the lock the remainder of the range and re-search the btree.
+ *
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
+ */
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
+ }
+
+ /*
+ * Only merges fiemap extents if
+ * 1) Their logical addresses are continuous
+ *
+ * 2) Their physical addresses are continuous
+ * So truly compressed (physical size smaller than logical size)
+ * extents won't get merged with each other
+ *
+ * 3) Share same flags
+ */
+ if (cache->offset + cache->len == offset &&
+ cache->phys + cache->len == phys &&
+ cache->flags == flags) {
+ cache->len += len;
+ return 0;
+ }
+
+emit:
+ /* Not mergeable, need to submit cached one */
+
+ if (cache->entries_pos == cache->entries_size) {
+ /*
+ * We will need to research for the end offset of the last
+ * stored extent and not from the current offset, because after
+ * unlocking the range and releasing the path, if there's a hole
+ * between that end offset and this current offset, a new extent
+ * may have been inserted due to a new write, so we don't want
+ * to miss it.
+ */
+ entry = &cache->entries[cache->entries_size - 1];
+ cache->next_search_offset = entry->offset + entry->len;
+ cache->cached = false;
+
+ return BTRFS_FIEMAP_FLUSH_CACHE;
+ }
+
+ entry = &cache->entries[cache->entries_pos];
+ entry->offset = cache->offset;
+ entry->phys = cache->phys;
+ entry->len = cache->len;
+ entry->flags = cache->flags;
+ cache->entries_pos++;
+ cache->extents_mapped++;
+
+ if (cache->extents_mapped == fieinfo->fi_extents_max) {
+ cache->cached = false;
+ return 1;
+ }
+assign:
+ cache->cached = true;
+ cache->offset = offset;
+ cache->phys = phys;
+ cache->len = len;
+ cache->flags = flags;
+
+ return 0;
+}
+
+/*
+ * Emit last fiemap cache
+ *
+ * The last fiemap cache may still be cached in the following case:
+ * 0 4k 8k
+ * |<- Fiemap range ->|
+ * |<------------ First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache)
+{
+ int ret;
+
+ if (!cache->cached)
+ return 0;
+
+ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+ cache->len, cache->flags);
+ cache->cached = false;
+ if (ret > 0)
+ ret = 0;
+ return ret;
+}
+
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
+{
+ struct extent_buffer *clone = path->nodes[0];
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ /*
+ * Add a temporary extra ref to an already cloned extent buffer to
+ * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
+ * the cost of allocating a new one.
+ */
+ ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
+ atomic_inc(&clone->refs);
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Important to preserve the start field, for the optimizations when
+ * checking if extents are shared (see extent_fiemap()).
+ *
+ * We must set ->start before calling copy_extent_buffer_full(). If we
+ * are on sub-pagesize blocksize, we use ->start to determine the offset
+ * into the folio where our eb exists, and if we update ->start after
+ * the fact then any subsequent reads of the eb may read from a
+ * different offset in the folio than where we originally copied into.
+ */
+ clone->start = path->nodes[0]->start;
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ copy_extent_buffer_full(clone, path->nodes[0]);
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+out:
+ if (ret)
+ free_extent_buffer(clone);
+
+ return ret;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+ }
+
+ /*
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we check if extents are shared, as backref walking may need to
+ * lock the same leaf we are processing.
+ */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct extent_state **delalloc_cached_state,
+ struct btrfs_backref_share_check_ctx *backref_ctx,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
+
+ /*
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
+ */
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
+
+ /*
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
+ */
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
+ }
+
+ /*
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+
+ /*
+ * For an inline extent, the disk_bytenr is where inline data starts at,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
+ */
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+ }
+
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
+
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+static int extent_fiemap(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct extent_state *delalloc_cached_state = NULL;
+ struct btrfs_path *path;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_share_check_ctx *backref_ctx;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
+ bool stopped = false;
+ int ret;
+
+ cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
+ cache.entries = kmalloc_array(cache.entries_size,
+ sizeof(struct btrfs_fiemap_entry),
+ GFP_KERNEL);
+ backref_ctx = btrfs_alloc_backref_share_check_ctx();
+ path = btrfs_alloc_path();
+ if (!cache.entries || !backref_ctx || !path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+restart:
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
+
+ lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, range_start);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /*
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
+ */
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
+ while (prev_extent_end < range_end) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
+
+ /*
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
+ */
+ if (extent_end <= range_start)
+ goto next_item;
+
+ backref_ctx->curr_leaf_bytenr = leaf->start;
+
+ /* We have in implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_end = min(key.offset, range_end) - 1;
+
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ prev_extent_end, hole_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= range_end) {
+ stopped = true;
+ break;
+ }
+ }
+
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
+ }
+
+ if (compression != BTRFS_COMPRESS_NONE)
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx,
+ disk_bytenr, extent_offset,
+ extent_gen, key.offset,
+ extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state,
+ backref_ctx, 0, 0, 0,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode,
+ disk_bytenr,
+ extent_gen,
+ backref_ctx);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
+ }
+
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* emit_fiemap_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
+
+ prev_extent_end = extent_end;
+next_item:
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto out_unlock;
+ }
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
+ }
+ cond_resched();
+ }
+
+check_eof_delalloc:
+ if (!stopped && prev_extent_end < range_end) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ &delalloc_cached_state, backref_ctx,
+ 0, 0, 0, prev_extent_end, range_end - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = range_end;
+ }
+
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_cached_state,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+out_unlock:
+ unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+
+ if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
+ btrfs_release_path(path);
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+ len -= cache.next_search_offset - start;
+ start = cache.next_search_offset;
+ goto restart;
+ } else if (ret < 0) {
+ goto out;
+ }
+
+ /*
+ * Must free the path before emitting to the fiemap buffer because we
+ * may have a non-cloned leaf and if the fiemap buffer is memory mapped
+ * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
+ * waiting for an ordered extent that in order to complete needs to
+ * modify that leaf, therefore leading to a deadlock.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
+ ret = flush_fiemap_cache(fieinfo, &cache);
+ if (ret)
+ goto out;
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+out:
+ free_extent_state(delalloc_cached_state);
+ kfree(cache.entries);
+ btrfs_free_backref_share_ctx(backref_ctx);
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ int ret;
+
+ ret = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * fiemap_prep() called filemap_write_and_wait() for the whole possible
+ * file range (0 to LLONG_MAX), but that is not enough if we have
+ * compression enabled. The first filemap_fdatawrite_range() only kicks
+ * in the compression of data (in an async thread) and will return
+ * before the compression is done and writeback is started. A second
+ * filemap_fdatawrite_range() is needed to wait for the compression to
+ * complete and writeback to start. We also need to wait for ordered
+ * extents to complete, because our fiemap implementation uses mainly
+ * file extent items to list the extents, searching for extent maps
+ * only for file ranges with holes or prealloc extents to figure out
+ * if we have delalloc in those ranges.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
+}
diff --git a/fs/btrfs/fiemap.h b/fs/btrfs/fiemap.h
new file mode 100644
index 000000000000..cfd74b35988f
--- /dev/null
+++ b/fs/btrfs/fiemap.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_FIEMAP_H
+#define BTRFS_FIEMAP_H
+
+#include <linux/fiemap.h>
+
+int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+
+#endif /* BTRFS_FIEMAP_H */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bce95f871750..5c342fe1af61 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -45,13 +45,12 @@
*/
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start, end, i_size;
int ret;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
- if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ if (!inode->file_extent_tree) {
inode->disk_i_size = i_size;
goto out_unlock;
}
@@ -84,13 +83,14 @@ out_unlock:
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -112,14 +112,15 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len)
{
+ if (!inode->file_extent_tree)
+ return 0;
+
if (len == 0)
return 0;
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
- return 0;
return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -352,7 +353,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
return BLK_STS_OK;
/*
@@ -1280,7 +1281,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const int slot = path->slots[0];
struct btrfs_key key;
u64 extent_start;
- u64 bytenr;
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
@@ -1290,24 +1290,29 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
em->start = extent_start;
em->len = btrfs_file_extent_end(path) - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, fi);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
+ if (disk_bytenr == 0) {
+ em->disk_bytenr = EXTENT_MAP_HOLE;
+ em->disk_num_bytes = 0;
+ em->offset = 0;
return;
}
+ em->disk_bytenr = disk_bytenr;
+ em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
extent_map_set_compression(em, compress_type);
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
} else {
- bytenr += btrfs_file_extent_offset(leaf, fi);
- em->block_start = bytenr;
- em->block_len = em->len;
+ /*
+ * Older kernels can create regular non-hole data
+ * extents with ram_bytes smaller than disk_num_bytes.
+ * Not a big deal, just always use disk_num_bytes
+ * for ram_bytes.
+ */
+ em->ram_bytes = em->disk_num_bytes;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
em->flags |= EXTENT_FLAG_PREALLOC;
}
@@ -1315,15 +1320,10 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
/* Tree-checker has ensured this. */
ASSERT(extent_start == 0);
- em->block_start = EXTENT_MAP_INLINE;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
em->start = 0;
em->len = fs_info->sectorsize;
- /*
- * Initialize orig_start and block_len with the same values
- * as in inode.c:btrfs_get_extent().
- */
- em->orig_start = EXTENT_MAP_HOLE;
- em->block_len = (u64)-1;
+ em->offset = 0;
extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d90138683a0a..21381de906f6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -17,8 +17,8 @@
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
-#include <linux/iomap.h>
#include "ctree.h"
+#include "direct-io.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
@@ -1104,7 +1104,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
&cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL, nowait, false);
+ NULL, nowait, false);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@@ -1140,8 +1140,7 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
- size_t count)
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1187,8 +1186,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
return 0;
}
-static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
- struct iov_iter *i)
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
{
struct file *file = iocb->ki_filp;
loff_t pos;
@@ -1451,194 +1449,6 @@ out:
return num_written ? num_written : ret;
}
-static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- const u32 blocksize_mask = fs_info->sectorsize - 1;
-
- if (offset & blocksize_mask)
- return -EINVAL;
-
- if (iov_iter_alignment(iter) & blocksize_mask)
- return -EINVAL;
-
- return 0;
-}
-
-static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- loff_t pos;
- ssize_t written = 0;
- ssize_t written_buffered;
- size_t prev_left = 0;
- loff_t endbyte;
- ssize_t ret;
- unsigned int ilock_flags = 0;
- struct iomap_dio *dio;
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- ilock_flags |= BTRFS_ILOCK_TRY;
-
- /*
- * If the write DIO is within EOF, use a shared lock and also only if
- * security bits will likely not be dropped by file_remove_privs() called
- * from btrfs_write_check(). Either will need to be rechecked after the
- * lock was acquired.
- */
- if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
- ilock_flags |= BTRFS_ILOCK_SHARED;
-
-relock:
- ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (ret < 0)
- return ret;
-
- /* Shared lock cannot be used with security bits set. */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- ret = generic_write_checks(iocb, from);
- if (ret <= 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- return ret;
- }
-
- ret = btrfs_write_check(iocb, from, ret);
- if (ret < 0) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto out;
- }
-
- pos = iocb->ki_pos;
- /*
- * Re-check since file size may have changed just before taking the
- * lock or pos may have changed because of O_APPEND in generic_write_check()
- */
- if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
- pos + iov_iter_count(from) > i_size_read(inode)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- ilock_flags &= ~BTRFS_ILOCK_SHARED;
- goto relock;
- }
-
- if (check_direct_IO(fs_info, from, pos)) {
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
- goto buffered;
- }
-
- /*
- * The iov_iter can be mapped to the same file range we are writing to.
- * If that's the case, then we will deadlock in the iomap code, because
- * it first calls our callback btrfs_dio_iomap_begin(), which will create
- * an ordered extent, and after that it will fault in the pages that the
- * iov_iter refers to. During the fault in we end up in the readahead
- * pages code (starting at btrfs_readahead()), which will lock the range,
- * find that ordered extent and then wait for it to complete (at
- * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
- * obviously the ordered extent can never complete as we didn't submit
- * yet the respective bio(s). This always happens when the buffer is
- * memory mapped to the same file range, since the iomap DIO code always
- * invalidates pages in the target file range (after starting and waiting
- * for any writeback).
- *
- * So here we disable page faults in the iov_iter and then retry if we
- * got -EFAULT, faulting in the pages before the retry.
- */
- from->nofault = true;
- dio = btrfs_dio_write(iocb, from, written);
- from->nofault = false;
-
- /*
- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
- * iocb, and that needs to lock the inode. So unlock it before calling
- * iomap_dio_complete() to avoid a deadlock.
- */
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
- if (IS_ERR_OR_NULL(dio))
- ret = PTR_ERR_OR_ZERO(dio);
- else
- ret = iomap_dio_complete(dio);
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- written = ret;
-
- if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(from);
- /*
- * We have more data left to write. Try to fault in as many as
- * possible of the remainder pages and retry. We do this without
- * releasing and locking again the inode, to prevent races with
- * truncate.
- *
- * Also, in case the iov refers to pages in the file range of the
- * file we want to write to (due to a mmap), we could enter an
- * infinite loop if we retry after faulting the pages in, since
- * iomap will invalidate any pages in the range early on, before
- * it tries to fault in the pages of the iov. So we keep track of
- * how much was left of iov in the previous EFAULT and fallback
- * to buffered IO in case we haven't made any progress.
- */
- if (left == prev_left) {
- ret = -ENOTBLK;
- } else {
- fault_in_iov_iter_readable(from, left);
- prev_left = left;
- goto relock;
- }
- }
-
- /*
- * If 'ret' is -ENOTBLK or we have not written all data, then it means
- * we must fallback to buffered IO.
- */
- if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
- goto out;
-
-buffered:
- /*
- * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
- * it must retry the operation in a context where blocking is acceptable,
- * because even if we end up not blocking during the buffered IO attempt
- * below, we will block when flushing and waiting for the IO.
- */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
-
- pos = iocb->ki_pos;
- written_buffered = btrfs_buffered_write(iocb, from);
- if (written_buffered < 0) {
- ret = written_buffered;
- goto out;
- }
- /*
- * Ensure all data is persisted. We want the next direct IO read to be
- * able to read what was just written.
- */
- endbyte = pos + written_buffered - 1;
- ret = btrfs_fdatawrite_range(inode, pos, endbyte);
- if (ret)
- goto out;
- ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
- if (ret)
- goto out;
- written += written_buffered;
- iocb->ki_pos = pos + written_buffered;
- invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
-out:
- return ret < 0 ? ret : written;
-}
-
static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded)
{
@@ -1738,7 +1548,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
+static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end)
{
int ret;
struct blk_plug plug;
@@ -1758,7 +1568,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
{
- struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_inode *inode = ctx->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) &&
@@ -1794,9 +1604,9 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file_dentry(file);
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
@@ -1829,7 +1639,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1853,7 +1663,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1865,8 +1675,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* running delalloc the full sync flag may be set if we need to drop
* extra extent map ranges due to temporary memory allocation failures.
*/
- full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We have to do this here to avoid the priority inversion of waiting on
@@ -1885,16 +1694,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
- clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
* checksum lookups in the csum tree, and use instead the
* checksums attached to the ordered extents.
*/
- btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
- &ctx.ordered_extents);
- ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end);
if (ret)
goto out_release_extents;
@@ -1907,8 +1715,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* extents to complete so that any extent maps that point to
* unwritten locations are dropped and we don't log them.
*/
- if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
- &BTRFS_I(inode)->runtime_flags))
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags))
ret = btrfs_wait_ordered_range(inode, start, len);
}
@@ -1923,8 +1730,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* modified so clear this flag in case it was set for whatever
* reason, it's no longer relevant.
*/
- clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
@@ -1932,7 +1738,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* for any errors that might have happened since we last
* checked called fsync.
*/
- ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+ ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err);
goto out_release_extents;
}
@@ -1982,7 +1788,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@@ -2051,7 +1857,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2350,11 +2156,9 @@ out:
hole_em->start = offset;
hole_em->len = end - offset;
hole_em->ram_bytes = hole_em->len;
- hole_em->orig_start = offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2385,7 +2189,7 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return PTR_ERR(em);
/* Hole or vacuum extent(only exists in no-hole mode) */
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
ret = 1;
*len = em->start + em->len > *start + *len ?
0 : *start + *len - em->start - em->len;
@@ -2814,7 +2618,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
- ret = btrfs_wait_ordered_range(inode, offset, len);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len);
if (ret)
goto out_only_mutex;
@@ -3042,7 +2846,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start == EXTENT_MAP_HOLE)
+ if (em->disk_bytenr == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
@@ -3106,7 +2910,7 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = em->block_start + em->len;
+ alloc_hint = extent_map_block_start(em) + em->len;
}
free_extent_map(em);
@@ -3124,7 +2928,7 @@ static int btrfs_zero_range(struct inode *inode,
mode);
goto out;
}
- if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+ if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
free_extent_map(em);
ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
0);
@@ -3309,7 +3113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* the file range and, due to the previous locking we did, we know there
* can't be more delalloc or ordered extents in the range.
*/
- ret = btrfs_wait_ordered_range(inode, alloc_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
@@ -3337,7 +3141,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
@@ -3920,97 +3724,6 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
-static int check_direct_read(struct btrfs_fs_info *fs_info,
- const struct iov_iter *iter, loff_t offset)
-{
- int ret;
- int i, seg;
-
- ret = check_direct_IO(fs_info, iter, offset);
- if (ret < 0)
- return ret;
-
- if (!iter_is_iovec(iter))
- return 0;
-
- for (seg = 0; seg < iter->nr_segs; seg++) {
- for (i = seg + 1; i < iter->nr_segs; i++) {
- const struct iovec *iov1 = iter_iov(iter) + seg;
- const struct iovec *iov2 = iter_iov(iter) + i;
-
- if (iov1->iov_base == iov2->iov_base)
- return -EINVAL;
- }
- }
- return 0;
-}
-
-static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- size_t prev_left = 0;
- ssize_t read = 0;
- ssize_t ret;
-
- if (fsverity_active(inode))
- return 0;
-
- if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
- return 0;
-
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
-again:
- /*
- * This is similar to what we do for direct IO writes, see the comment
- * at btrfs_direct_write(), but we also disable page faults in addition
- * to disabling them only at the iov_iter level. This is because when
- * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
- * which can still trigger page fault ins despite having set ->nofault
- * to true of our 'to' iov_iter.
- *
- * The difference to direct IO writes is that we deadlock when trying
- * to lock the extent range in the inode's tree during he page reads
- * triggered by the fault in (while for writes it is due to waiting for
- * our own ordered extent). This is because for direct IO reads,
- * btrfs_dio_iomap_begin() returns with the extent range locked, which
- * is only unlocked in the endio callback (end_bio_extent_readpage()).
- */
- pagefault_disable();
- to->nofault = true;
- ret = btrfs_dio_read(iocb, to, read);
- to->nofault = false;
- pagefault_enable();
-
- /* No increment (+=) because iomap returns a cumulative value. */
- if (ret > 0)
- read = ret;
-
- if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
- const size_t left = iov_iter_count(to);
-
- if (left == prev_left) {
- /*
- * We didn't make any progress since the last attempt,
- * fallback to a buffered read for the remainder of the
- * range. This is just to avoid any possibility of looping
- * for too long.
- */
- ret = read;
- } else {
- /*
- * We made some progress since the last retry or this is
- * the first time we are retrying. Fault in as many pages
- * as possible and retry.
- */
- fault_in_iov_iter_writeable(to, left);
- prev_left = left;
- goto again;
- }
- }
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- return ret < 0 ? ret : read;
-}
-
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret = 0;
@@ -4045,8 +3758,9 @@ const struct file_operations btrfs_file_operations = {
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end)
{
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
/*
@@ -4063,10 +3777,9 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
* know better and pull this out at some point in the future, it is
* right and you are wrong.
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = filemap_fdatawrite_range(mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags))
+ ret = filemap_fdatawrite_range(mapping, start, end);
return ret;
}
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 77aaca208c7b..912254e653cf 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -37,12 +37,14 @@ int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached, bool noreserve);
-int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
+int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3ab8dea5036b..3f9b7507543a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -82,7 +82,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_path *path,
u64 offset)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
@@ -116,7 +115,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
+ inode = btrfs_iget_path(location.objectid, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -138,7 +137,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
spin_lock(&block_group->lock);
if (block_group->inode)
- inode = igrab(block_group->inode);
+ inode = igrab(&block_group->inode->vfs_inode);
spin_unlock(&block_group->lock);
if (inode)
return inode;
@@ -157,7 +156,7 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
}
if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
- block_group->inode = igrab(inode);
+ block_group->inode = BTRFS_I(igrab(inode));
spin_unlock(&block_group->lock);
return inode;
@@ -858,6 +857,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
spin_unlock(&ctl->tree_lock);
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
+ kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap);
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -1268,7 +1268,7 @@ static int flush_dirty_cache(struct inode *inode)
{
int ret;
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DELALLOC, NULL);
@@ -1483,7 +1483,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl->entries = entries;
io_ctl->bitmaps = bitmaps;
- ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
+ ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
goto out;
@@ -2697,7 +2697,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
int bg_reclaim_threshold = 0;
- bool initial = (size == block_group->length);
+ bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0));
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 90f2938bd743..7ba50e133921 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1300,10 +1300,14 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(free_space_root->node);
btrfs_clear_buffer_dirty(trans, free_space_root->node);
btrfs_tree_unlock(free_space_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
- free_space_root->node, 0, 1);
-
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
+ free_space_root->node, 0, 1);
btrfs_put_root(free_space_root);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
return btrfs_commit_transaction(trans);
}
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 89f0650631cd..3d6d4b503220 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -29,7 +29,6 @@
#include "extent-io-tree.h"
#include "async-thread.h"
#include "block-rsv.h"
-#include "fs.h"
struct inode;
struct super_block;
@@ -99,7 +98,9 @@ enum {
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
- BTRFS_FS_STATE_NO_CSUMS,
+ /* Checksum errors are ignored. */
+ BTRFS_FS_STATE_NO_DATA_CSUMS,
+ BTRFS_FS_STATE_SKIP_META_CSUMS,
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
@@ -194,37 +195,39 @@ enum {
* Note: don't forget to add new options to btrfs_show_options()
*/
enum {
- BTRFS_MOUNT_NODATASUM = (1UL << 0),
- BTRFS_MOUNT_NODATACOW = (1UL << 1),
- BTRFS_MOUNT_NOBARRIER = (1UL << 2),
- BTRFS_MOUNT_SSD = (1UL << 3),
- BTRFS_MOUNT_DEGRADED = (1UL << 4),
- BTRFS_MOUNT_COMPRESS = (1UL << 5),
- BTRFS_MOUNT_NOTREELOG = (1UL << 6),
- BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
- BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
- BTRFS_MOUNT_NOSSD = (1UL << 9),
- BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
- BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
- BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
- BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
- BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
- BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
- BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
- BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
- BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
- BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19),
- BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20),
- BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21),
- BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22),
- BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23),
- BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24),
- BTRFS_MOUNT_REF_VERIFY = (1UL << 25),
- BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26),
- BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
- BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
- BTRFS_MOUNT_NODISCARD = (1UL << 29),
- BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
+ BTRFS_MOUNT_NODATASUM = (1ULL << 0),
+ BTRFS_MOUNT_NODATACOW = (1ULL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1ULL << 2),
+ BTRFS_MOUNT_SSD = (1ULL << 3),
+ BTRFS_MOUNT_DEGRADED = (1ULL << 4),
+ BTRFS_MOUNT_COMPRESS = (1ULL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1ULL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1ULL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1ULL << 8),
+ BTRFS_MOUNT_NOSSD = (1ULL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1ULL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1ULL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1ULL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1ULL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1ULL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1ULL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1ULL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1ULL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1ULL << 18),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1ULL << 19),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1ULL << 20),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1ULL << 21),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1ULL << 22),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1ULL << 23),
+ BTRFS_MOUNT_NOLOGREPLAY = (1ULL << 24),
+ BTRFS_MOUNT_REF_VERIFY = (1ULL << 25),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1ULL << 26),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1ULL << 27),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1ULL << 28),
+ BTRFS_MOUNT_NODISCARD = (1ULL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30),
+ BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31),
+ BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32),
};
/*
@@ -478,7 +481,7 @@ struct btrfs_fs_info {
* required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- unsigned long mount_opt;
+ unsigned long long mount_opt;
unsigned long compress_type:4;
unsigned int compress_level;
@@ -630,6 +633,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
+ spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
@@ -957,7 +961,7 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
/*
* Count how many fs_info->max_extent_size cover the @size
*/
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (!fs_info)
@@ -1018,7 +1022,7 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
@@ -1037,7 +1041,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
* since setting and checking for SB_RDONLY in the superblock's flags is not
* atomic.
*/
-static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
+static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
btrfs_fs_closing(fs_info);
@@ -1058,7 +1062,7 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
#define EXPORT_FOR_TESTS
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}
@@ -1069,7 +1073,7 @@ void btrfs_test_destroy_inode(struct inode *inode);
#define EXPORT_FOR_TESTS static
-static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+static inline int btrfs_is_testing(const struct btrfs_fs_info *fs_info)
{
return 0;
}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 84a94d19b22c..316756ff08ac 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -141,8 +141,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name);
if (!extref) {
- btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
- ret = -EROFS;
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 753db965f7c0..01eab6955647 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,31 +70,13 @@
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
+#include "fiemap.h"
struct btrfs_iget_args {
u64 ino;
struct btrfs_root *root;
};
-struct btrfs_dio_data {
- ssize_t submitted;
- struct extent_changeset *data_reserved;
- struct btrfs_ordered_extent *ordered;
- bool data_space_reserved;
- bool nocow_done;
-};
-
-struct btrfs_dio_private {
- /* Range of I/O */
- u64 file_offset;
- u32 bytes;
-
- /* This must be last */
- struct btrfs_bio bbio;
-};
-
-static struct bio_set btrfs_dio_bioset;
-
struct btrfs_rename_ctx {
/* Output field. Stores the index number of the old directory entry. */
u64 index;
@@ -137,11 +119,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode,
struct page *locked_page, u64 start,
u64 end, struct writeback_control *wbc,
bool pages_dirty);
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type);
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
@@ -877,7 +854,7 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
if (btrfs_test_opt(fs_info, COMPRESS) ||
inode->flags & BTRFS_INODE_COMPRESS ||
inode->prop_compress)
- return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
+ return btrfs_compress_heuristic(inode, start, end);
return 0;
}
@@ -890,6 +867,26 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(NULL, inode, small_write);
}
+static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+ unsigned long end_index = end >> PAGE_SHIFT;
+ struct page *page;
+ int ret = 0;
+
+ for (unsigned long index = start >> PAGE_SHIFT;
+ index <= end_index; index++) {
+ page = find_get_page(inode->i_mapping, index);
+ if (unlikely(!page)) {
+ if (!ret)
+ ret = -ENOENT;
+ continue;
+ }
+ clear_page_dirty_for_io(page);
+ put_page(page);
+ }
+ return ret;
+}
+
/*
* Work queue call back to started compression on a file and pages.
*
@@ -931,7 +928,16 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+
+ /*
+ * All the folios should have been locked thus no failure.
+ *
+ * And even if some folios are missing, btrfs_compress_folios()
+ * would handle them correctly, so here just do an ASSERT() check for
+ * early logic errors.
+ */
+ ASSERT(ret == 0);
/*
* We need to save i_size before now because it could change in between
@@ -1152,6 +1158,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
struct btrfs_key ins;
struct page *locked_page = NULL;
struct extent_state *cached = NULL;
@@ -1198,29 +1205,22 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
- em = create_io_em(inode, start,
- async_extent->ram_size, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.ram_bytes = async_extent->ram_size;
+ file_extent.num_bytes = async_extent->ram_size;
+ file_extent.offset = 0;
+ file_extent.compression = async_extent->compress_type;
+
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserve;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */
- async_extent->ram_size, /* num_bytes */
- async_extent->ram_size, /* ram_bytes */
- ins.objectid, /* disk_bytenr */
- ins.offset, /* disk_num_bytes */
- 0, /* offset */
- 1 << BTRFS_ORDERED_COMPRESSED,
- async_extent->compress_type);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1 << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1264,8 +1264,8 @@ out_free_reserve:
kfree(async_extent);
}
-static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
- u64 num_bytes)
+u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ u64 num_bytes)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1279,15 +1279,15 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* first block in this inode and use that as a hint. If that
* block is also bogus then just don't worry about it.
*/
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
free_extent_map(em);
em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
+ if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = extent_map_block_start(em);
if (em)
free_extent_map(em);
} else {
- alloc_hint = em->block_start;
+ alloc_hint = extent_map_block_start(em);
free_extent_map(em);
}
}
@@ -1375,7 +1375,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
}
- alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
+ alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
* Relocation relies on the relocated extents to have exactly the same
@@ -1395,6 +1395,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
while (num_bytes > 0) {
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
@@ -1431,18 +1432,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
extent_reserved = true;
ram_size = ins.offset;
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = ins.offset;
+ file_extent.ram_bytes = ins.offset;
+ file_extent.offset = 0;
+ file_extent.compression = BTRFS_COMPRESS_NONE;
lock_extent(&inode->io_tree, start, start + ram_size - 1,
&cached);
- em = create_io_em(inode, start, ins.offset, /* len */
- start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- ram_size, /* ram_bytes */
- BTRFS_COMPRESS_NONE, /* compress_type */
- BTRFS_ORDERED_REGULAR /* type */);
+ em = btrfs_create_io_em(inode, start, &file_extent,
+ BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1451,10 +1452,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
- ram_size, ins.objectid, cur_alloc_size,
- 0, 1 << BTRFS_ORDERED_REGULAR,
- BTRFS_COMPRESS_NONE);
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
+ 1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached);
@@ -1617,10 +1616,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
u64 alloc_hint = 0;
if (do_free) {
- struct async_chunk *async_chunk;
struct async_cow *async_cow;
- async_chunk = container_of(work, struct async_chunk, work);
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
@@ -1850,13 +1847,11 @@ struct can_nocow_file_extent_args {
*/
bool free_path;
- /* Output fields. Only set when can_nocow_file_extent() returns 1. */
-
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 extent_offset;
- /* Number of bytes that can be written to in NOCOW mode. */
- u64 num_bytes;
+ /*
+ * Output fields. Only set when can_nocow_file_extent() returns 1.
+ * The expected file extent for the NOCOW write.
+ */
+ struct btrfs_file_extent file_extent;
};
/*
@@ -1878,6 +1873,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
struct btrfs_root *root = inode->root;
struct btrfs_file_extent_item *fi;
struct btrfs_root *csum_root;
+ u64 io_start;
u64 extent_end;
u8 extent_type;
int can_nocow = 0;
@@ -1890,11 +1886,6 @@ static int can_nocow_file_extent(struct btrfs_path *path,
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
goto out;
- /* Can't access these fields unless we know it's not an inline extent. */
- args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- args->extent_offset = btrfs_file_extent_offset(leaf, fi);
-
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
extent_type == BTRFS_FILE_EXTENT_REG)
goto out;
@@ -1910,7 +1901,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
goto out;
/* An explicit hole, must COW. */
- if (args->disk_bytenr == 0)
+ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
goto out;
/* Compressed/encrypted/encoded extents must be COWed. */
@@ -1921,6 +1912,12 @@ static int can_nocow_file_extent(struct btrfs_path *path,
extent_end = btrfs_file_extent_end(path);
+ args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
+ args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
+
/*
* The following checks can be expensive, as they need to take other
* locks and do btree or rbtree searches, so release the path to avoid
@@ -1929,8 +1926,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
- key->offset - args->extent_offset,
- args->disk_bytenr, args->strict, path);
+ key->offset - args->file_extent.offset,
+ args->file_extent.disk_bytenr, args->strict, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@@ -1951,18 +1948,18 @@ static int can_nocow_file_extent(struct btrfs_path *path,
atomic_read(&root->snapshot_force_cow))
goto out;
- args->disk_bytenr += args->extent_offset;
- args->disk_bytenr += args->start - key->offset;
- args->num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
+ args->file_extent.offset += args->start - key->offset;
+ io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
/*
* Force COW if csums exist in the range. This ensures that csums for a
* given extent are either valid or do not exist.
*/
- csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
- ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
- args->disk_bytenr + args->num_bytes - 1,
+ csum_root = btrfs_csum_root(root->fs_info, io_start);
+ ret = btrfs_lookup_csums_list(csum_root, io_start,
+ io_start + args->file_extent.num_bytes - 1,
NULL, nowait);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
@@ -2021,7 +2018,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 ram_bytes;
u64 nocow_end;
int extent_type;
bool is_prealloc;
@@ -2100,7 +2096,6 @@ next_slot:
ret = -EUCLEAN;
goto error;
}
- ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
extent_end = btrfs_file_extent_end(path);
/*
@@ -2120,7 +2115,9 @@ next_slot:
goto must_cow;
ret = 0;
- nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
+ nocow_bg = btrfs_inc_nocow_writers(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset);
if (!nocow_bg) {
must_cow:
/*
@@ -2156,21 +2153,16 @@ must_cow:
}
}
- nocow_end = cur_offset + nocow_args.num_bytes - 1;
+ nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
if (is_prealloc) {
- u64 orig_start = found_key.offset - nocow_args.extent_offset;
struct extent_map *em;
- em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
- orig_start,
- nocow_args.disk_bytenr, /* block_start */
- nocow_args.num_bytes, /* block_len */
- nocow_args.disk_num_bytes, /* orig_block_len */
- ram_bytes, BTRFS_COMPRESS_NONE,
- BTRFS_ORDERED_PREALLOC);
+ em = btrfs_create_io_em(inode, cur_offset,
+ &nocow_args.file_extent,
+ BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, cur_offset,
nocow_end, &cached_state);
@@ -2182,12 +2174,10 @@ must_cow:
}
ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- nocow_args.num_bytes, nocow_args.num_bytes,
- nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
+ &nocow_args.file_extent,
is_prealloc
? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW),
- BTRFS_COMPRESS_NONE);
+ : (1 << BTRFS_ORDERED_NOCOW));
btrfs_dec_nocow_writers(nocow_bg);
if (IS_ERR(ordered)) {
if (is_prealloc) {
@@ -2601,44 +2591,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
}
}
-static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
- struct btrfs_ordered_extent *ordered)
-{
- u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
- u64 len = bbio->bio.bi_iter.bi_size;
- struct btrfs_ordered_extent *new;
- int ret;
-
- /* Must always be called for the beginning of an ordered extent. */
- if (WARN_ON_ONCE(start != ordered->disk_bytenr))
- return -EINVAL;
-
- /* No need to split if the ordered extent covers the entire bio. */
- if (ordered->disk_num_bytes == len) {
- refcount_inc(&ordered->refs);
- bbio->ordered = ordered;
- return 0;
- }
-
- /*
- * Don't split the extent_map for NOCOW extents, as we're writing into
- * a pre-existing one.
- */
- if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
- if (ret)
- return ret;
- }
-
- new = btrfs_split_ordered_extent(ordered, len);
- if (IS_ERR(new))
- return PTR_ERR(new);
- bbio->ordered = new;
- return 0;
-}
-
/*
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
@@ -2681,7 +2633,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (IS_ERR(em))
return PTR_ERR(em);
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->disk_bytenr != EXTENT_MAP_HOLE)
goto next;
em_len = em->len;
@@ -3037,10 +2989,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
- if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+ if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
num_bytes = oe->truncated_len;
- ram_bytes = num_bytes;
- }
btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3056,7 +3006,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
- return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ return insert_reserved_file_extent(trans, oe->inode,
oe->file_offset, &stack_fi,
update_inode_bytes, oe->qgroup_rsv);
}
@@ -3068,7 +3018,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
*/
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
- struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
+ struct btrfs_inode *inode = ordered_extent->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans = NULL;
@@ -3302,7 +3252,7 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
- if (btrfs_is_zoned(inode_to_fs_info(ordered->inode)) &&
+ if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
@@ -3596,7 +3546,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, last_objectid, root);
+ inode = btrfs_iget(last_objectid, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
inode = NULL;
@@ -3781,6 +3731,30 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
return 1;
}
+static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (WARN_ON_ONCE(inode->file_extent_tree))
+ return 0;
+ if (btrfs_fs_incompat(fs_info, NO_HOLES))
+ return 0;
+ if (!S_ISREG(inode->vfs_inode.i_mode))
+ return 0;
+ if (btrfs_is_free_space_inode(inode))
+ return 0;
+
+ inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!inode->file_extent_tree)
+ return -ENOMEM;
+
+ extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
+
+ return 0;
+}
+
/*
* read an inode from the btree into the in-memory inode
*/
@@ -3800,6 +3774,10 @@ static int btrfs_read_locked_inode(struct inode *inode,
bool filled = false;
int first_xattr_slot;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ return ret;
+
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
@@ -3810,7 +3788,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
return -ENOMEM;
}
- memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+ btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3856,7 +3834,9 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
@@ -4038,13 +4018,15 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item;
struct btrfs_path *path;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
+ btrfs_get_inode_key(inode, &key);
+ ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -4308,7 +4290,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
objectid = btrfs_root_id(inode->root);
} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
- objectid = inode->location.objectid;
+ objectid = inode->ref_root_id;
} else {
WARN_ON(1);
fscrypt_free_filename(&fname);
@@ -4539,11 +4521,6 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
ret = PTR_ERR(trans);
goto out_release;
}
- ret = btrfs_record_root_in_trans(trans, root);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_end_trans;
- }
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
@@ -4967,11 +4944,9 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
}
hole_em->start = cur_offset;
hole_em->len = hole_size;
- hole_em->orig_start = cur_offset;
- hole_em->block_start = EXTENT_MAP_HOLE;
- hole_em->block_len = 0;
- hole_em->orig_block_len = 0;
+ hole_em->disk_bytenr = EXTENT_MAP_HOLE;
+ hole_em->disk_num_bytes = 0;
hole_em->ram_bytes = hole_size;
hole_em->generation = btrfs_get_fs_generation(fs_info);
@@ -5049,7 +5024,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_wait_ordered_range(inode,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode),
ALIGN(newsize, fs_info->sectorsize),
(u64)-1);
if (ret)
@@ -5079,7 +5054,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (err)
return err;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
@@ -5493,59 +5468,52 @@ out:
return err;
}
-static void inode_tree_add(struct btrfs_inode *inode)
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
{
struct btrfs_root *root = inode->root;
- struct btrfs_inode *entry;
- struct rb_node **p;
- struct rb_node *parent;
- struct rb_node *new = &inode->rb_node;
- u64 ino = btrfs_ino(inode);
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
if (inode_unhashed(&inode->vfs_inode))
- return;
- parent = NULL;
- spin_lock(&root->inode_lock);
- p = &root->inode_tree.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct btrfs_inode, rb_node);
+ return 0;
- if (ino < btrfs_ino(entry))
- p = &parent->rb_left;
- else if (ino > btrfs_ino(entry))
- p = &parent->rb_right;
- else {
- WARN_ON(!(entry->vfs_inode.i_state &
- (I_WILL_FREE | I_FREEING)));
- rb_replace_node(parent, new, &root->inode_tree);
- RB_CLEAR_NODE(parent);
- spin_unlock(&root->inode_lock);
- return;
- }
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
}
- rb_link_node(new, parent, p);
- rb_insert_color(new, &root->inode_tree);
- spin_unlock(&root->inode_lock);
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
}
-static void inode_tree_del(struct btrfs_inode *inode)
+static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
- int empty = 0;
+ struct btrfs_inode *entry;
+ bool empty = false;
- spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&inode->rb_node)) {
- rb_erase(&inode->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&inode->rb_node);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- }
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ entry = __xa_erase(&root->inodes, btrfs_ino(inode));
+ if (entry == inode)
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- spin_lock(&root->inode_lock);
- empty = RB_EMPTY_ROOT(&root->inode_tree);
- spin_unlock(&root->inode_lock);
+ xa_lock(&root->inodes);
+ empty = xa_empty(&root->inodes);
+ xa_unlock(&root->inodes);
if (empty)
btrfs_add_dead_root(root);
}
@@ -5556,10 +5524,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
- BTRFS_I(inode)->location.objectid = args->ino;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), args->ino);
BTRFS_I(inode)->root = btrfs_grab_root(args->root);
if (args->root && args->root == args->root->fs_info->tree_root &&
@@ -5573,12 +5538,11 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == BTRFS_I(inode)->location.objectid &&
+ return args->ino == btrfs_ino(BTRFS_I(inode)) &&
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
- struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5587,7 +5551,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
- inode = iget5_locked(s, hashval, btrfs_find_actor,
+ inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -5599,41 +5563,44 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
* allocator. NULL is also valid but may require an additional allocation
* later.
*/
-struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
- struct btrfs_root *root, struct btrfs_path *path)
+struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
struct inode *inode;
+ int ret;
- inode = btrfs_iget_locked(s, ino, root);
+ inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- int ret;
+ if (!(inode->i_state & I_NEW))
+ return inode;
- ret = btrfs_read_locked_inode(inode, path);
- if (!ret) {
- inode_tree_add(BTRFS_I(inode));
- unlock_new_inode(inode);
- } else {
- iget_failed(inode);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode, this means the inode item
- * was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- inode = ERR_PTR(ret);
- }
- }
+ ret = btrfs_read_locked_inode(inode, path);
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_read_locked_inode(), this means the inode item was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto error;
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret < 0)
+ goto error;
+
+ unlock_new_inode(inode);
return inode;
+error:
+ iget_failed(inode);
+ return ERR_PTR(ret);
}
-struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
+struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(s, ino, root, NULL);
+ return btrfs_iget_path(ino, root, NULL);
}
static struct inode *new_simple_dir(struct inode *dir,
@@ -5647,10 +5614,11 @@ static struct inode *new_simple_dir(struct inode *dir,
return ERR_PTR(-ENOMEM);
BTRFS_I(inode)->root = btrfs_grab_root(root);
- memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
- inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
@@ -5704,7 +5672,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, location.objectid, root);
+ inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
return inode;
@@ -5728,7 +5696,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir, &location, root);
} else {
- inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
+ inode = btrfs_iget(location.objectid, sub_root);
btrfs_put_root(sub_root);
if (IS_ERR(inode))
@@ -5948,7 +5916,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
addr = private->filldir_buf;
path->reada = READA_FORWARD;
- put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
&ins_list, &del_list);
again:
@@ -6038,7 +6006,7 @@ nopos:
ret = 0;
err:
if (put)
- btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
+ btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
@@ -6123,7 +6091,7 @@ static int btrfs_insert_inode_locked(struct inode *inode)
{
struct btrfs_iget_args args;
- args.ino = BTRFS_I(inode)->location.objectid;
+ args.ino = btrfs_ino(BTRFS_I(inode));
args.root = BTRFS_I(inode)->root;
return insert_inode_locked4(inode,
@@ -6230,7 +6198,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct btrfs_root *root;
struct btrfs_inode_item *inode_item;
- struct btrfs_key *location;
struct btrfs_path *path;
u64 objectid;
struct btrfs_inode_ref *ref;
@@ -6239,6 +6206,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_item_batch batch;
unsigned long ptr;
int ret;
+ bool xa_reserved = false;
path = btrfs_alloc_path();
if (!path)
@@ -6248,10 +6216,19 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
root = BTRFS_I(inode)->root;
+ ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ if (ret)
+ goto out;
+
ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
- inode->i_ino = objectid;
+ btrfs_set_inode_number(BTRFS_I(inode), objectid);
+
+ ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
+ if (ret)
+ goto out;
+ xa_reserved = true;
if (args->orphan) {
/*
@@ -6266,8 +6243,10 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- /* index_cnt is ignored for everything but a dir. */
- BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
+
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -6294,11 +6273,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- location = &BTRFS_I(inode)->location;
- location->objectid = objectid;
- location->offset = 0;
- location->type = BTRFS_INODE_ITEM_KEY;
-
ret = btrfs_insert_inode_locked(inode);
if (ret < 0) {
if (!args->orphan)
@@ -6397,8 +6371,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
* Subvolumes inherit properties from their parent subvolume,
* not the directory they were created in.
*/
- parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_I(dir)->root);
+ parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
@@ -6426,7 +6399,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
}
}
- inode_tree_add(BTRFS_I(inode));
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
+ if (WARN_ON(ret)) {
+ /* Shouldn't happen, we used xa_reserve() before. */
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
trace_btrfs_inode_new(inode);
btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
@@ -6454,6 +6432,9 @@ discard:
ihold(inode);
discard_new_inode(inode);
out:
+ if (xa_reserved)
+ xa_release(&root->inodes, objectid);
+
btrfs_free_path(path);
return ret;
}
@@ -6818,7 +6799,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
if (em) {
if (em->start > start || em->start + em->len <= start)
free_extent_map(em);
- else if (em->block_start == EXTENT_MAP_INLINE && page)
+ else if (em->disk_bytenr == EXTENT_MAP_INLINE && page)
free_extent_map(em);
else
goto out;
@@ -6829,9 +6810,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
goto out;
}
em->start = EXTENT_MAP_HOLE;
- em->orig_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
em->len = (u64)-1;
- em->block_len = (u64)-1;
path = btrfs_alloc_path();
if (!path) {
@@ -6921,9 +6901,8 @@ next:
/* New extent overlaps with existing one */
em->start = start;
- em->orig_start = start;
em->len = found_key.offset - start;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
goto insert;
}
@@ -6947,7 +6926,7 @@ next:
*
* Other members are not utilized for inline extents.
*/
- ASSERT(em->block_start == EXTENT_MAP_INLINE);
+ ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
ret = read_inline_extent(inode, path, page);
@@ -6957,9 +6936,8 @@ next:
}
not_found:
em->start = start;
- em->orig_start = start;
em->len = len;
- em->block_start = EXTENT_MAP_HOLE;
+ em->disk_bytenr = EXTENT_MAP_HOLE;
insert:
ret = 0;
btrfs_release_path(path);
@@ -6986,84 +6964,6 @@ out:
return em;
}
-static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- const u64 start,
- const u64 len,
- const u64 orig_start,
- const u64 block_start,
- const u64 block_len,
- const u64 orig_block_len,
- const u64 ram_bytes,
- const int type)
-{
- struct extent_map *em = NULL;
- struct btrfs_ordered_extent *ordered;
-
- if (type != BTRFS_ORDERED_NOCOW) {
- em = create_io_em(inode, start, len, orig_start, block_start,
- block_len, orig_block_len, ram_bytes,
- BTRFS_COMPRESS_NONE, /* compress_type */
- type);
- if (IS_ERR(em))
- goto out;
- }
- ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
- block_start, block_len, 0,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT),
- BTRFS_COMPRESS_NONE);
- if (IS_ERR(ordered)) {
- if (em) {
- free_extent_map(em);
- btrfs_drop_extent_map_range(inode, start,
- start + len - 1, false);
- }
- em = ERR_CAST(ordered);
- } else {
- ASSERT(!dio_data->ordered);
- dio_data->ordered = ordered;
- }
- out:
-
- return em;
-}
-
-static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 len)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct extent_map *em;
- struct btrfs_key ins;
- u64 alloc_hint;
- int ret;
-
- alloc_hint = get_extent_allocation_hint(inode, start, len);
-again:
- ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
- 0, alloc_hint, &ins, 1, 1);
- if (ret == -EAGAIN) {
- ASSERT(btrfs_is_zoned(fs_info));
- wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
- TASK_UNINTERRUPTIBLE);
- goto again;
- }
- if (ret)
- return ERR_PTR(ret);
-
- em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
- ins.objectid, ins.offset, ins.offset,
- ins.offset, BTRFS_ORDERED_REGULAR);
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
-
- return em;
-}
-
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
@@ -7098,8 +6998,8 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* any ordered extents.
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
- u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes, bool nowait, bool strict)
+ struct btrfs_file_extent *file_extent,
+ bool nowait, bool strict)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
@@ -7149,8 +7049,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
- if (ram_bytes)
- *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
@@ -7168,14 +7066,16 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
ret = 0;
- if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
+ if (btrfs_extent_readonly(fs_info,
+ nocow_args.file_extent.disk_bytenr +
+ nocow_args.file_extent.offset))
goto out;
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
- range_end = round_up(offset + nocow_args.num_bytes,
+ range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
if (ret) {
@@ -7184,117 +7084,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
}
}
- if (orig_start)
- *orig_start = key.offset - nocow_args.extent_offset;
- if (orig_block_len)
- *orig_block_len = nocow_args.disk_num_bytes;
+ if (file_extent)
+ memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
- *len = nocow_args.num_bytes;
+ *len = nocow_args.file_extent.num_bytes;
ret = 1;
out:
btrfs_free_path(path);
return ret;
}
-static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state,
- unsigned int iomap_flags)
-{
- const bool writing = (iomap_flags & IOMAP_WRITE);
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct btrfs_ordered_extent *ordered;
- int ret = 0;
-
- while (1) {
- if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state))
- return -EAGAIN;
- } else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
- }
- /*
- * We're concerned with the entire range that we're going to be
- * doing DIO to, so we need to make sure there's no ordered
- * extents in this range.
- */
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
- lockend - lockstart + 1);
-
- /*
- * We need to make sure there are no buffered pages in this
- * range either, we could have raced between the invalidate in
- * generic_file_direct_write and locking the extent. The
- * invalidate needs to happen so that reads after a write do not
- * get stale data.
- */
- if (!ordered &&
- (!writing || !filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)))
- break;
-
- unlock_extent(io_tree, lockstart, lockend, cached_state);
-
- if (ordered) {
- if (nowait) {
- btrfs_put_ordered_extent(ordered);
- ret = -EAGAIN;
- break;
- }
- /*
- * If we are doing a DIO read and the ordered extent we
- * found is for a buffered write, we can not wait for it
- * to complete and retry, because if we do so we can
- * deadlock with concurrent buffered writes on page
- * locks. This happens only if our DIO read covers more
- * than one extent map, if at this point has already
- * created an ordered extent for a previous extent map
- * and locked its range in the inode's io tree, and a
- * concurrent write against that previous extent map's
- * range and this range started (we unlock the ranges
- * in the io tree only when the bios complete and
- * buffered writes always lock pages before attempting
- * to lock range in the io tree).
- */
- if (writing ||
- test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(ordered);
- else
- ret = nowait ? -EAGAIN : -ENOTBLK;
- btrfs_put_ordered_extent(ordered);
- } else {
- /*
- * We could trigger writeback for this range (and wait
- * for it to complete) and then invalidate the pages for
- * this range (through invalidate_inode_pages2_range()),
- * but that can lead us to a deadlock with a concurrent
- * call to readahead (a buffered read or a defrag call
- * triggered a readahead) on a page lock due to an
- * ordered dio extent we created before but did not have
- * yet a corresponding bio submitted (whence it can not
- * complete), which makes readahead wait for that
- * ordered extent to complete while holding a lock on
- * that page.
- */
- ret = nowait ? -EAGAIN : -ENOTBLK;
- }
-
- if (ret)
- break;
-
- cond_resched();
- }
-
- return ret;
-}
-
/* The callers of this must take lock_extent() */
-static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
- u64 len, u64 orig_start, u64 block_start,
- u64 block_len, u64 orig_block_len,
- u64 ram_bytes, int compress_type,
- int type)
+struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
+ const struct btrfs_file_extent *file_extent,
+ int type)
{
struct extent_map *em;
int ret;
@@ -7313,32 +7116,26 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
switch (type) {
case BTRFS_ORDERED_PREALLOC:
- /* Uncompressed extents. */
- ASSERT(block_len == len);
-
/* We're only referring part of a larger preallocated extent. */
- ASSERT(block_len <= ram_bytes);
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
break;
case BTRFS_ORDERED_REGULAR:
- /* Uncompressed extents. */
- ASSERT(block_len == len);
-
/* COW results a new extent matching our file extent size. */
- ASSERT(orig_block_len == len);
- ASSERT(ram_bytes == len);
+ ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
+ ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
/* Since it's a new extent, we should not have any offset. */
- ASSERT(orig_start == start);
+ ASSERT(file_extent->offset == 0);
break;
case BTRFS_ORDERED_COMPRESSED:
/* Must be compressed. */
- ASSERT(compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
/*
* Encoded write can make us to refer to part of the
* uncompressed extent.
*/
- ASSERT(len <= ram_bytes);
+ ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
break;
}
@@ -7347,16 +7144,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
return ERR_PTR(-ENOMEM);
em->start = start;
- em->orig_start = orig_start;
- em->len = len;
- em->block_len = block_len;
- em->block_start = block_start;
- em->orig_block_len = orig_block_len;
- em->ram_bytes = ram_bytes;
+ em->len = file_extent->num_bytes;
+ em->disk_bytenr = file_extent->disk_bytenr;
+ em->disk_num_bytes = file_extent->disk_num_bytes;
+ em->ram_bytes = file_extent->ram_bytes;
em->generation = -1;
+ em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, compress_type);
+ extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7368,580 +7164,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
return em;
}
-
-static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct inode *inode,
- struct btrfs_dio_data *dio_data,
- u64 start, u64 *lenp,
- unsigned int iomap_flags)
-{
- const bool nowait = (iomap_flags & IOMAP_NOWAIT);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em = *map;
- int type;
- u64 block_start, orig_start, orig_block_len, ram_bytes;
- struct btrfs_block_group *bg;
- bool can_nocow = false;
- bool space_reserved = false;
- u64 len = *lenp;
- u64 prev_len;
- int ret = 0;
-
- /*
- * We don't allocate a new extent in the following cases
- *
- * 1) The inode is marked as NODATACOW. In this case we'll just use the
- * existing extent.
- * 2) The extent is marked as PREALLOC. We're good to go here and can
- * just use the extent.
- *
- */
- if ((em->flags & EXTENT_FLAG_PREALLOC) ||
- ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
- em->block_start != EXTENT_MAP_HOLE)) {
- if (em->flags & EXTENT_FLAG_PREALLOC)
- type = BTRFS_ORDERED_PREALLOC;
- else
- type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->len - (start - em->start));
- block_start = em->block_start + (start - em->start);
-
- if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes, false, false) == 1) {
- bg = btrfs_inc_nocow_writers(fs_info, block_start);
- if (bg)
- can_nocow = true;
- }
- }
-
- prev_len = len;
- if (can_nocow) {
- struct extent_map *em2;
-
- /* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- nowait);
- if (ret < 0) {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
- btrfs_dec_nocow_writers(bg);
- if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
- ret = -EAGAIN;
- goto out;
- }
- space_reserved = true;
-
- em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
- orig_start, block_start,
- len, orig_block_len,
- ram_bytes, type);
- btrfs_dec_nocow_writers(bg);
- if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
- *map = em2;
- em = em2;
- }
-
- if (IS_ERR(em2)) {
- ret = PTR_ERR(em2);
- goto out;
- }
-
- dio_data->nocow_done = true;
- } else {
- /* Our caller expects us to free the input extent map. */
- free_extent_map(em);
- *map = NULL;
-
- if (nowait) {
- ret = -EAGAIN;
- goto out;
- }
-
- /*
- * If we could not allocate data space before locking the file
- * range and we can't do a NOCOW write, then we have to fail.
- */
- if (!dio_data->data_space_reserved) {
- ret = -ENOSPC;
- goto out;
- }
-
- /*
- * We have to COW and we have already reserved data space before,
- * so now we reserve only metadata.
- */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
- false);
- if (ret < 0)
- goto out;
- space_reserved = true;
-
- em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- *map = em;
- len = min(len, em->len - (start - em->start));
- if (len < prev_len)
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- prev_len - len, true);
- }
-
- /*
- * We have created our ordered extent, so we can now release our reservation
- * for an outstanding extent.
- */
- btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
-
- /*
- * Need to update the i_size under the extent lock so buffered
- * readers will get the updated i_size when we unlock.
- */
- if (start + len > i_size_read(inode))
- i_size_write(inode, start + len);
-out:
- if (ret && space_reserved) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
- }
- *lenp = len;
- return ret;
-}
-
-static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
- loff_t length, unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
- struct btrfs_dio_data *dio_data = iter->private;
- u64 lockstart, lockend;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
- u64 len = length;
- const u64 data_alloc_len = length;
- bool unlock_extents = false;
-
- /*
- * We could potentially fault if we have a buffer > PAGE_SIZE, and if
- * we're NOWAIT we may submit a bio for a partial range and return
- * EIOCBQUEUED, which would result in an errant short read.
- *
- * The best way to handle this would be to allow for partial completions
- * of iocb's, so we could submit the partial bio, return and fault in
- * the rest of the pages, and then submit the io for the rest of the
- * range. However we don't have that currently, so simply return
- * -EAGAIN at this point so that the normal path is used.
- */
- if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
- return -EAGAIN;
-
- /*
- * Cap the size of reads to that usually seen in buffered I/O as we need
- * to allocate a contiguous array for the checksums.
- */
- if (!write)
- len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
-
- lockstart = start;
- lockend = start + len - 1;
-
- /*
- * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
- * enough if we've written compressed pages to this area, so we need to
- * flush the dirty pages again to make absolutely sure that any
- * outstanding dirty pages are on disk - the first flush only starts
- * compression on the data, while keeping the pages locked, so by the
- * time the second flush returns we know bios for the compressed pages
- * were submitted and finished, and the pages no longer under writeback.
- *
- * If we have a NOWAIT request and we have any pages in the range that
- * are locked, likely due to compression still in progress, we don't want
- * to block on page locks. We also don't want to block on pages marked as
- * dirty or under writeback (same as for the non-compression case).
- * iomap_dio_rw() did the same check, but after that and before we got
- * here, mmap'ed writes may have happened or buffered reads started
- * (readpage() and readahead(), which lock pages), as we haven't locked
- * the file range yet.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- if (flags & IOMAP_NOWAIT) {
- if (filemap_range_needs_writeback(inode->i_mapping,
- lockstart, lockend))
- return -EAGAIN;
- } else {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- start + length - 1);
- if (ret)
- return ret;
- }
- }
-
- memset(dio_data, 0, sizeof(*dio_data));
-
- /*
- * We always try to allocate data space and must do it before locking
- * the file range, to avoid deadlocks with concurrent writes to the same
- * range if the range has several extents and the writes don't expand the
- * current i_size (the inode lock is taken in shared mode). If we fail to
- * allocate data space here we continue and later, after locking the
- * file range, we fail with ENOSPC only if we figure out we can not do a
- * NOCOW write.
- */
- if (write && !(flags & IOMAP_NOWAIT)) {
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &dio_data->data_reserved,
- start, data_alloc_len, false);
- if (!ret)
- dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
- goto err;
- }
-
- /*
- * If this errors out it's because we couldn't invalidate pagecache for
- * this range and we need to fallback to buffered IO, or we are doing a
- * NOWAIT read/write and we need to block.
- */
- ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
- if (ret < 0)
- goto err;
-
- em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto unlock_err;
- }
-
- /*
- * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
- * io. INLINE is special, and we could probably kludge it in here, but
- * it's still buffered so for safety lets just fall back to the generic
- * buffered path.
- *
- * For COMPRESSED we _have_ to read the entire extent in so we can
- * decompress it, so there will be buffering required no matter what we
- * do, so go ahead and fallback to buffered.
- *
- * We return -ENOTBLK because that's what makes DIO go ahead and go back
- * to buffered IO. Don't blame me, this is the price we pay for using
- * the generic code.
- */
- if (extent_map_is_compressed(em) ||
- em->block_start == EXTENT_MAP_INLINE) {
- free_extent_map(em);
- /*
- * If we are in a NOWAIT context, return -EAGAIN in order to
- * fallback to buffered IO. This is not only because we can
- * block with buffered IO (no support for NOWAIT semantics at
- * the moment) but also to avoid returning short reads to user
- * space - this happens if we were able to read some data from
- * previous non-compressed extents and then when we fallback to
- * buffered IO, at btrfs_file_read_iter() by calling
- * filemap_read(), we fail to fault in pages for the read buffer,
- * in which case filemap_read() returns a short read (the number
- * of bytes previously read is > 0, so it does not return -EFAULT).
- */
- ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
- goto unlock_err;
- }
-
- len = min(len, em->len - (start - em->start));
-
- /*
- * If we have a NOWAIT request and the range contains multiple extents
- * (or a mix of extents and holes), then we return -EAGAIN to make the
- * caller fallback to a context where it can do a blocking (without
- * NOWAIT) request. This way we avoid doing partial IO and returning
- * success to the caller, which is not optimal for writes and for reads
- * it can result in unexpected behaviour for an application.
- *
- * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
- * iomap_dio_rw(), we can end up returning less data then what the caller
- * asked for, resulting in an unexpected, and incorrect, short read.
- * That is, the caller asked to read N bytes and we return less than that,
- * which is wrong unless we are crossing EOF. This happens if we get a
- * page fault error when trying to fault in pages for the buffer that is
- * associated to the struct iov_iter passed to iomap_dio_rw(), and we
- * have previously submitted bios for other extents in the range, in
- * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
- * those bios have completed by the time we get the page fault error,
- * which we return back to our caller - we should only return EIOCBQUEUED
- * after we have submitted bios for all the extents in the range.
- */
- if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
- ret = -EAGAIN;
- goto unlock_err;
- }
-
- if (write) {
- ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
- start, &len, flags);
- if (ret < 0)
- goto unlock_err;
- unlock_extents = true;
- /* Recalc len in case the new em is smaller than requested */
- len = min(len, em->len - (start - em->start));
- if (dio_data->data_space_reserved) {
- u64 release_offset;
- u64 release_len = 0;
-
- if (dio_data->nocow_done) {
- release_offset = start;
- release_len = data_alloc_len;
- } else if (len < data_alloc_len) {
- release_offset = start + len;
- release_len = data_alloc_len - len;
- }
-
- if (release_len > 0)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- release_offset,
- release_len);
- }
- } else {
- /*
- * We need to unlock only the end area that we aren't using.
- * The rest is going to be unlocked by the endio routine.
- */
- lockstart = start + len;
- if (lockstart < lockend)
- unlock_extents = true;
- }
-
- if (unlock_extents)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
- else
- free_extent_state(cached_state);
-
- /*
- * Translate extent map information to iomap.
- * We trim the extents (and move the addr) even though iomap code does
- * that, since we have locked only the parts we are performing I/O in.
- */
- if ((em->block_start == EXTENT_MAP_HOLE) ||
- ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
- } else {
- iomap->addr = em->block_start + (start - em->start);
- iomap->type = IOMAP_MAPPED;
- }
- iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
- iomap->length = len;
- free_extent_map(em);
-
- return 0;
-
-unlock_err:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
-err:
- if (dio_data->data_space_reserved) {
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- dio_data->data_reserved,
- start, data_alloc_len);
- extent_changeset_free(dio_data->data_reserved);
- }
-
- return ret;
-}
-
-static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
- ssize_t written, unsigned int flags, struct iomap *iomap)
-{
- struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
- struct btrfs_dio_data *dio_data = iter->private;
- size_t submitted = dio_data->submitted;
- const bool write = !!(flags & IOMAP_WRITE);
- int ret = 0;
-
- if (!write && (iomap->type == IOMAP_HOLE)) {
- /* If reading from a hole, unlock and return */
- unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
- NULL);
- return 0;
- }
-
- if (submitted < length) {
- pos += submitted;
- length -= submitted;
- if (write)
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- pos, length, false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
- ret = -ENOTBLK;
- }
- if (write) {
- btrfs_put_ordered_extent(dio_data->ordered);
- dio_data->ordered = NULL;
- }
-
- if (write)
- extent_changeset_free(dio_data->data_reserved);
- return ret;
-}
-
-static void btrfs_dio_end_io(struct btrfs_bio *bbio)
-{
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_inode *inode = bbio->inode;
- struct bio *bio = &bbio->bio;
-
- if (bio->bi_status) {
- btrfs_warn(inode->root->fs_info,
- "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
- btrfs_ino(inode), bio->bi_opf,
- dip->file_offset, dip->bytes, bio->bi_status);
- }
-
- if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- btrfs_finish_ordered_extent(bbio->ordered, NULL,
- dip->file_offset, dip->bytes,
- !bio->bi_status);
- } else {
- unlock_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
- }
-
- bbio->bio.bi_private = bbio->private;
- iomap_dio_bio_end_io(bio);
-}
-
-static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
- loff_t file_offset)
-{
- struct btrfs_bio *bbio = btrfs_bio(bio);
- struct btrfs_dio_private *dip =
- container_of(bbio, struct btrfs_dio_private, bbio);
- struct btrfs_dio_data *dio_data = iter->private;
-
- btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
- btrfs_dio_end_io, bio->bi_private);
- bbio->inode = BTRFS_I(iter->inode);
- bbio->file_offset = file_offset;
-
- dip->file_offset = file_offset;
- dip->bytes = bio->bi_iter.bi_size;
-
- dio_data->submitted += bio->bi_iter.bi_size;
-
- /*
- * Check if we are doing a partial write. If we are, we need to split
- * the ordered extent to match the submitted bio. Hang on to the
- * remaining unfinishable ordered_extent in dio_data so that it can be
- * cancelled in iomap_end to avoid a deadlock wherein faulting the
- * remaining pages is blocked on the outstanding ordered extent.
- */
- if (iter->flags & IOMAP_WRITE) {
- int ret;
-
- ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
- if (ret) {
- btrfs_finish_ordered_extent(dio_data->ordered, NULL,
- file_offset, dip->bytes,
- !ret);
- bio->bi_status = errno_to_blk_status(ret);
- iomap_dio_bio_end_io(bio);
- return;
- }
- }
-
- btrfs_submit_bio(bbio, 0);
-}
-
-static const struct iomap_ops btrfs_dio_iomap_ops = {
- .iomap_begin = btrfs_dio_iomap_begin,
- .iomap_end = btrfs_dio_iomap_end,
-};
-
-static const struct iomap_dio_ops btrfs_dio_ops = {
- .submit_io = btrfs_dio_submit_io,
- .bio_set = &btrfs_dio_bioset,
-};
-
-ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
- size_t done_before)
-{
- struct btrfs_dio_data data = { 0 };
-
- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- IOMAP_DIO_PARTIAL, &data, done_before);
-}
-
-static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
- int ret;
-
- ret = fiemap_prep(inode, fieinfo, start, &len, 0);
- if (ret)
- return ret;
-
- /*
- * fiemap_prep() called filemap_write_and_wait() for the whole possible
- * file range (0 to LLONG_MAX), but that is not enough if we have
- * compression enabled. The first filemap_fdatawrite_range() only kicks
- * in the compression of data (in an async thread) and will return
- * before the compression is done and writeback is started. A second
- * filemap_fdatawrite_range() is needed to wait for the compression to
- * complete and writeback to start. We also need to wait for ordered
- * extents to complete, because our fiemap implementation uses mainly
- * file extent items to list the extents, searching for extent maps
- * only for file ranges with holes or prealloc extents to figure out
- * if we have delalloc in those ranges.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret)
- return ret;
- }
-
- btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- /*
- * We did an initial flush to avoid holding the inode's lock while
- * triggering writeback and waiting for the completion of IO and ordered
- * extents. Now after we locked the inode we do it again, because it's
- * possible a new write may have happened in between those two steps.
- */
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
- ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
- if (ret) {
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
- return ret;
- }
- }
-
- ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
- btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
-
- return ret;
-}
-
/*
* For release_folio() and invalidate_folio() we have a race window where
* folio_end_writeback() is called but the subpage spinlock is not yet released.
@@ -8198,7 +7420,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
if (!skip_writeback) {
- ret = btrfs_wait_ordered_range(&inode->vfs_inode,
+ ret = btrfs_wait_ordered_range(inode,
inode->vfs_inode.i_size & (~mask),
(u64)-1);
if (ret)
@@ -8399,20 +7621,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
- struct extent_io_tree *file_extent_tree = NULL;
-
- /* Self tests may pass a NULL fs_info. */
- if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
- file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
- if (!file_extent_tree)
- return NULL;
- }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei) {
- kfree(file_extent_tree);
+ if (!ei)
return NULL;
- }
ei->root = NULL;
ei->generation = 0;
@@ -8425,8 +7637,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->disk_i_size = 0;
ei->flags = 0;
ei->ro_flags = 0;
+ /*
+ * ->index_cnt will be properly initialized later when creating a new
+ * inode (btrfs_create_new_inode()) or when reading an existing inode
+ * from disk (btrfs_read_locked_inode()).
+ */
ei->csum_bytes = 0;
- ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_reflink_trans = 0;
@@ -8453,20 +7669,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- ei->file_extent_tree = file_extent_tree;
- if (file_extent_tree) {
- extent_io_tree_init(fs_info, ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
- /* Lockdep class is set only for the file extent tree. */
- lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
- }
+ ei->file_extent_tree = NULL;
+
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
ei->ordered_tree_last = NULL;
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
- RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
@@ -8502,9 +7712,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
if (!S_ISDIR(vfs_inode->i_mode)) {
WARN_ON(inode->delalloc_bytes);
WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
}
- WARN_ON(inode->csum_bytes);
- WARN_ON(inode->defrag_bytes);
+ if (!root || !btrfs_is_data_reloc_root(root))
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8538,7 +7749,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
}
}
btrfs_qgroup_check_reserved_leak(inode);
- inode_tree_del(inode);
+ btrfs_del_inode_from_root(inode);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
btrfs_put_root(inode->root);
@@ -8572,7 +7783,6 @@ void __cold btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
- bioset_exit(&btrfs_dio_bioset);
kmem_cache_destroy(btrfs_inode_cachep);
}
@@ -8583,17 +7793,9 @@ int __init btrfs_init_cachep(void)
SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
init_once);
if (!btrfs_inode_cachep)
- goto fail;
-
- if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_dio_private, bbio.bio),
- BIOSET_NEED_BVECS))
- goto fail;
+ return -ENOMEM;
return 0;
-fail:
- btrfs_destroy_cachep();
- return -ENOMEM;
}
static int btrfs_getattr(struct mnt_idmap *idmap,
@@ -9586,11 +8788,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
em->start = cur_offset;
- em->orig_start = cur_offset;
em->len = ins.offset;
- em->block_start = ins.objectid;
- em->block_len = ins.offset;
- em->orig_block_len = ins.offset;
+ em->disk_bytenr = ins.objectid;
+ em->offset = 0;
+ em->disk_num_bytes = ins.offset;
em->ram_bytes = ins.offset;
em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
@@ -9956,7 +9157,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ ret = btrfs_alloc_page_array(nr_pages, pages, false);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10033,7 +9234,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ ret = btrfs_wait_ordered_range(inode, start,
lockend - start + 1);
if (ret)
goto out_unlock_inode;
@@ -10053,7 +9254,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_extent;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
u64 extent_start = em->start;
/*
@@ -10074,33 +9275,33 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
- if (em->block_start == EXTENT_MAP_HOLE ||
+ if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->block_start;
+ disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
*/
- if (em->block_len > count) {
+ if (em->disk_num_bytes > count) {
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->block_len;
- count = em->block_len;
+ disk_io_size = em->disk_num_bytes;
+ count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
- encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = em->block_start + (start - em->start);
+ disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
@@ -10155,6 +9356,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
struct extent_changeset *data_reserved = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
+ struct btrfs_file_extent file_extent;
int compression;
size_t orig_count;
u64 start, end;
@@ -10276,7 +9478,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
for (;;) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ ret = btrfs_wait_ordered_range(inode, start, num_bytes);
if (ret)
goto out_folios;
ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
@@ -10330,22 +9532,22 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
goto out_delalloc_release;
extent_reserved = true;
- em = create_io_em(inode, start, num_bytes,
- start - encoded->unencoded_offset, ins.objectid,
- ins.offset, ins.offset, ram_bytes, compression,
- BTRFS_ORDERED_COMPRESSED);
+ file_extent.disk_bytenr = ins.objectid;
+ file_extent.disk_num_bytes = ins.offset;
+ file_extent.num_bytes = num_bytes;
+ file_extent.ram_bytes = ram_bytes;
+ file_extent.offset = encoded->unencoded_offset;
+ file_extent.compression = compression;
+ em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_free_reserved;
}
free_extent_map(em);
- ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
- ins.objectid, ins.offset,
- encoded->unencoded_offset,
+ ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
(1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED),
- compression);
+ (1 << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -10385,7 +9587,7 @@ out_unlock:
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
- __folio_put(folios[i]);
+ folio_put(folios[i]);
}
kvfree(folios);
out:
@@ -10549,7 +9751,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* file changes again after this, the user is doing something stupid and
* we don't really care.
*/
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
return ret;
@@ -10635,12 +9837,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- if (em->block_start == EXTENT_MAP_HOLE) {
+ if (em->disk_bytenr == EXTENT_MAP_HOLE) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
- if (em->block_start == EXTENT_MAP_INLINE) {
+ if (em->disk_bytenr == EXTENT_MAP_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@@ -10658,12 +9860,12 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- logical_block_start = em->block_start + (start - em->start);
+ logical_block_start = extent_map_block_start(em) + (start - em->start);
len = min(len, em->len - (start - em->start));
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
+ ret = can_nocow_extent(inode, start, &len, NULL, false, true);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -10860,52 +10062,23 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en
*/
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{
- struct rb_node *node;
- struct rb_node *prev;
struct btrfs_inode *inode;
+ unsigned long from = min_ino;
- spin_lock(&root->inode_lock);
-again:
- node = root->inode_tree.rb_node;
- prev = NULL;
- while (node) {
- prev = node;
- inode = rb_entry(node, struct btrfs_inode, rb_node);
- if (min_ino < btrfs_ino(inode))
- node = node->rb_left;
- else if (min_ino > btrfs_ino(inode))
- node = node->rb_right;
- else
+ xa_lock(&root->inodes);
+ while (true) {
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+ if (igrab(&inode->vfs_inode))
break;
- }
-
- if (!node) {
- while (prev) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (min_ino <= btrfs_ino(inode)) {
- node = prev;
- break;
- }
- prev = rb_next(prev);
- }
- }
-
- while (node) {
- inode = rb_entry(prev, struct btrfs_inode, rb_node);
- if (igrab(&inode->vfs_inode)) {
- spin_unlock(&root->inode_lock);
- return inode;
- }
-
- min_ino = btrfs_ino(inode) + 1;
- if (cond_resched_lock(&root->inode_lock))
- goto again;
- node = rb_next(node);
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
}
- spin_unlock(&root->inode_lock);
+ xa_unlock(&root->inodes);
- return NULL;
+ return inode;
}
static const struct inode_operations btrfs_dir_inode_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index efd5d6e9589e..e0a664b8a46a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -375,15 +375,15 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
- strlen(comp), 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
- 0, 0);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -552,7 +552,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return 0;
}
-int __pure btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(const u8 *uuid)
{
int i;
@@ -658,15 +658,10 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = PTR_ERR(trans);
goto out_release_rsv;
}
- ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
- if (ret)
- goto out;
btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
qgroup_reserved = 0;
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
- /* Tree log can't currently deal with an inode which is a new root. */
- btrfs_set_log_full_commit(trans);
ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
if (ret)
@@ -719,6 +714,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
+ int ret2;
+
/*
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
@@ -729,7 +726,9 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_tree_lock(leaf);
btrfs_clear_buffer_dirty(trans, leaf);
btrfs_tree_unlock(leaf);
- btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
+ if (ret2 < 0)
+ btrfs_abort_transaction(trans, ret2);
free_extent_buffer(leaf);
goto out;
}
@@ -767,6 +766,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -854,7 +855,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
- pending_snapshot->dir = dir;
+ pending_snapshot->dir = BTRFS_I(dir);
pending_snapshot->inherit = inherit;
trans = btrfs_start_transaction(root, 0);
@@ -1070,7 +1071,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
@@ -1917,8 +1918,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct super_block *sb = inode->i_sb;
- struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+ u64 upper_limit = btrfs_ino(BTRFS_I(inode));
u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
u64 dirid = args->dirid;
unsigned long item_off;
@@ -1944,7 +1944,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* If the bottom subvolume does not exist directly under upper_limit,
* construct the path in from the bottom up.
*/
- if (dirid != upper_limit.objectid) {
+ if (dirid != upper_limit) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
root = btrfs_get_fs_root(fs_info, treeid, true);
@@ -2006,7 +2006,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
* btree and lock the same leaf.
*/
btrfs_release_path(path);
- temp_inode = btrfs_iget(sb, key2.objectid, root);
+ temp_inode = btrfs_iget(key2.objectid, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out_put;
@@ -2019,7 +2019,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
goto out_put;
}
- if (key.offset == upper_limit.objectid)
+ if (key.offset == upper_limit)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
@@ -2140,7 +2140,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
inode = file_inode(file);
if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
- BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
/*
* The subvolume does not exist under fd with which this is
* called
@@ -3807,12 +3807,29 @@ drop_write:
return ret;
}
+/*
+ * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
+ * done before any operations.
+ */
+static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
+{
+ bool ret = true;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (!fs_info->quota_root)
+ ret = false;
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ return ret;
+}
+
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
+ struct btrfs_qgroup_list *prealloc = NULL;
struct btrfs_trans_handle *trans;
int ret;
int err;
@@ -3820,6 +3837,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3830,14 +3850,27 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
goto drop_write;
}
+ if (sa->assign) {
+ prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
+ if (!prealloc) {
+ ret = -ENOMEM;
+ goto drop_write;
+ }
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
+ /*
+ * Prealloc ownership is moved to the relation handler, there it's used
+ * or freed on error.
+ */
if (sa->assign) {
- ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
+ ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc);
+ prealloc = NULL;
} else {
ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
}
@@ -3847,13 +3880,15 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
err = btrfs_run_qgroups(trans);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
- btrfs_handle_fs_error(fs_info, err,
- "failed to update qgroup status and info");
+ btrfs_warn(fs_info,
+ "qgroup status update failed after %s relation, marked as inconsistent",
+ sa->assign ? "adding" : "deleting");
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
+ kfree(prealloc);
kfree(sa);
drop_write:
mnt_drop_write_file(file);
@@ -3872,6 +3907,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3928,6 +3966,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(root->fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -3973,6 +4014,9 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!qgroup_enabled(fs_info))
+ return -ENOTCONN;
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -4429,7 +4473,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4627,7 +4671,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_iov;
init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
goto out_iov;
kiocb.ki_pos = pos;
@@ -4751,10 +4795,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(inode, argp, false);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(inode, argp, true);
+ return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2c5dc25ec670..19cd26b0244a 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -19,7 +19,7 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int __pure btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 1bc8e6738879..3c15c75e0582 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -11,7 +11,6 @@
#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
-#include "locking.h"
struct extent_buffer;
struct btrfs_path;
diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h
index e32906ab6faa..07f1bb1c6aa3 100644
--- a/fs/btrfs/lru_cache.h
+++ b/fs/btrfs/lru_cache.h
@@ -6,7 +6,6 @@
#include <linux/types.h>
#include <linux/maple_tree.h>
#include <linux/list.h>
-#include "lru_cache.h"
/*
* A cache entry. This is meant to be embedded in a structure of a user of
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1c396ac167aa..1e2a68b8f62d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -258,8 +258,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
- if (ret < 0) {
- pr_debug("BTRFS: lzo in loop returned %d\n", ret);
+ if (unlikely(ret < 0)) {
+ /* lzo1x_1_compress never fails. */
ret = -EIO;
goto out;
}
@@ -354,11 +354,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
* and all sectors should be used.
* If this happens, it means the compressed extent is corrupted.
*/
- if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
- round_up(len_in, sectorsize) < cb->compressed_len) {
+ if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) ||
+ round_up(len_in, sectorsize) < cb->compressed_len)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
btrfs_err(fs_info,
- "invalid lzo header, lzo len %u compressed len %u",
- len_in, cb->compressed_len);
+"lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, len_in, cb->compressed_len);
return -EUCLEAN;
}
@@ -383,13 +386,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap_local(kaddr);
cur_in += LZO_LEN;
- if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ if (unlikely(seg_len > WORKSPACE_CBUF_LENGTH)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
/*
* seg_len shouldn't be larger than we have allocated
* for workspace->cbuf
*/
- btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
- seg_len);
+ btrfs_err(fs_info,
+ "lzo segment too big, root %llu inode %llu offset %llu len %u",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start, seg_len);
return -EIO;
}
@@ -399,8 +406,13 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
/* Decompress the data */
ret = lzo1x_decompress_safe(workspace->cbuf, seg_len,
workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- btrfs_err(fs_info, "failed to decompress");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ cb->start);
return -EIO;
}
@@ -454,8 +466,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
- if (ret != LZO_E_OK) {
- pr_warn("BTRFS: decompress failed!\n");
+ if (unlikely(ret != LZO_E_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(fs_info,
+ "lzo decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
ret = -EIO;
goto out;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index 210d9c82e2ae..77752eec125d 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -20,7 +20,8 @@ static const char fs_state_chars[] = {
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
- [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C',
+ [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
};
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index dde4904aead9..0d599fd847c9 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -66,7 +66,7 @@ struct rb_simple_node {
u64 bytenr;
};
-static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
+static inline struct rb_node *rb_simple_search(const struct rb_root *root, u64 bytenr)
{
struct rb_node *node = root->rb_node;
struct rb_simple_node *entry;
@@ -93,7 +93,7 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
* Return the rb_node that start at or after @bytenr. If there is no entry at
* or after @bytner return NULL.
*/
-static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
u64 bytenr)
{
struct rb_node *node = root->rb_node, *ret = NULL;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 35a413ce935d..82a68394a89c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,6 +19,7 @@
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
+#include "block-group.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -179,7 +180,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = igrab(&inode->vfs_inode);
+ entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -207,7 +208,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
@@ -223,7 +224,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
spin_lock_irq(&inode->ordered_tree_lock);
node = tree_insert(&inode->ordered_tree, entry->file_offset,
&entry->rb_node);
- if (node)
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
"inconsistency in ordered tree at offset %llu",
entry->file_offset);
@@ -263,17 +264,39 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
*/
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type)
+ const struct btrfs_file_extent *file_extent, unsigned long flags)
{
struct btrfs_ordered_extent *entry;
ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
- entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
- disk_bytenr, disk_num_bytes, offset, flags,
- compress_type);
+ /*
+ * For regular writes, we just use the members in @file_extent.
+ *
+ * For NOCOW, we don't really care about the numbers except @start and
+ * file_extent->num_bytes, as we won't insert a file extent item at all.
+ *
+ * For PREALLOC, we do not use ordered extent members, but
+ * btrfs_mark_extent_written() handles everything.
+ *
+ * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
+ * or btrfs_split_ordered_extent() cannot handle it correctly.
+ */
+ if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->num_bytes,
+ file_extent->disk_bytenr + file_extent->offset,
+ file_extent->num_bytes, 0, flags,
+ file_extent->compression);
+ else
+ entry = alloc_ordered_extent(inode, file_offset,
+ file_extent->num_bytes,
+ file_extent->ram_bytes,
+ file_extent->disk_bytenr,
+ file_extent->disk_num_bytes,
+ file_extent->offset, flags,
+ file_extent->compression);
if (!IS_ERR(entry))
insert_ordered_extent(entry);
return entry;
@@ -287,7 +310,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum)
{
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
spin_lock_irq(&inode->ordered_tree_lock);
list_add_tail(&sum->list, &entry->list);
@@ -297,7 +320,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
- mapping_set_error(ordered->inode->i_mapping, -EIO);
+ mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
}
static void finish_ordered_fn(struct btrfs_work *work)
@@ -312,7 +335,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset,
u64 len, bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
lockdep_assert_held(&inode->ordered_tree_lock);
@@ -365,7 +388,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
fs_info->endio_freespace_worker : fs_info->endio_write_workers;
@@ -374,11 +397,11 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
btrfs_queue_work(wq, &ordered->work);
}
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
bool uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
unsigned long flags;
bool ret;
@@ -421,7 +444,6 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (ret)
btrfs_queue_ordered_fn(ordered);
- return ret;
}
/*
@@ -588,14 +610,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
- btrfs_add_delayed_iput(BTRFS_I(entry->inode));
+ btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -626,7 +648,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
- /* This is paired with btrfs_alloc_ordered_extent. */
+ /* This is paired with alloc_ordered_extent(). */
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
@@ -712,11 +734,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
}
/*
- * wait for all the ordered extents in a root. This is done when balancing
- * space between drives.
+ * Wait for all the ordered extents in a root. Use @bg as range or do whole
+ * range if it's NULL.
*/
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(splice);
@@ -724,7 +746,17 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
LIST_HEAD(works);
struct btrfs_ordered_extent *ordered, *next;
u64 count = 0;
- const u64 range_end = range_start + range_len;
+ u64 range_start, range_len;
+ u64 range_end;
+
+ if (bg) {
+ range_start = bg->start;
+ range_len = bg->length;
+ } else {
+ range_start = 0;
+ range_len = U64_MAX;
+ }
+ range_end = range_start + range_len;
mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
@@ -751,10 +783,10 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
cond_resched();
- spin_lock(&root->ordered_extent_lock);
if (nr != U64_MAX)
nr--;
count++;
+ spin_lock(&root->ordered_extent_lock);
}
list_splice_tail(&skipped, &root->ordered_extents);
list_splice_tail(&splice, &root->ordered_extents);
@@ -771,8 +803,12 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
return count;
}
+/*
+ * Wait for @nr ordered extents that intersect the @bg, or the whole range of
+ * the filesystem if @bg is NULL.
+ */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len)
+ const struct btrfs_block_group *bg)
{
struct btrfs_root *root;
LIST_HEAD(splice);
@@ -790,14 +826,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- done = btrfs_wait_ordered_extents(root, nr,
- range_start, range_len);
+ done = btrfs_wait_ordered_extents(root, nr, bg);
btrfs_put_root(root);
- spin_lock(&fs_info->ordered_root_lock);
- if (nr != U64_MAX) {
+ if (nr != U64_MAX)
nr -= done;
- }
+
+ spin_lock(&fs_info->ordered_root_lock);
}
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
@@ -814,7 +849,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
- struct btrfs_inode *inode = BTRFS_I(entry->inode);
+ struct btrfs_inode *inode = entry->inode;
bool freespace_inode;
trace_btrfs_ordered_extent_start(inode, entry);
@@ -841,7 +876,7 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
{
int ret = 0;
int ret_wb = 0;
@@ -871,11 +906,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* before the ordered extents complete - to avoid failures (-EEXIST)
* when adding the new ordered extents to the ordered tree.
*/
- ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
+ ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -1173,7 +1208,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
struct btrfs_ordered_extent *ordered, u64 len)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 file_offset = ordered->file_offset;
@@ -1212,15 +1247,32 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
/* One ref for the tree. */
refcount_inc(&new->refs);
+ /*
+ * Take the root's ordered_extent_lock to avoid a race with
+ * btrfs_wait_ordered_extents() when updating the disk_bytenr and
+ * disk_num_bytes fields of the ordered extent below. And we disable
+ * IRQs because the inode's ordered_tree_lock is used in IRQ context
+ * elsewhere.
+ *
+ * There's no concern about a previous caller of
+ * btrfs_wait_ordered_extents() getting the trimmed ordered extent
+ * before we insert the new one, because even if it gets the ordered
+ * extent before it's trimmed and the new one inserted, right before it
+ * uses it or during its use, the ordered extent might have been
+ * trimmed in the meanwhile, and it missed the new ordered extent.
+ * There's no way around this and it's harmless for current use cases,
+ * so we take the root's ordered_extent_lock to fix that race during
+ * trimming and silence tools like KCSAN.
+ */
spin_lock_irq(&root->ordered_extent_lock);
spin_lock(&inode->ordered_tree_lock);
- /* Remove from tree once */
- node = &ordered->rb_node;
- rb_erase(node, &inode->ordered_tree);
- RB_CLEAR_NODE(node);
- if (inode->ordered_tree_last == node)
- inode->ordered_tree_last = NULL;
+ /*
+ * We don't have overlapping ordered extents (that would imply double
+ * allocation of extents) and we checked above that the split length
+ * does not cross the ordered extent's num_bytes field, so there's
+ * no need to remove it and re-insert it in the tree.
+ */
ordered->file_offset += len;
ordered->disk_bytenr += len;
ordered->num_bytes -= len;
@@ -1250,18 +1302,10 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
offset += sum->len;
}
- /* Re-insert the node */
- node = tree_insert(&inode->ordered_tree, ordered->file_offset,
- &ordered->rb_node);
- if (node)
- btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
- ordered->file_offset);
-
node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
- if (node)
+ if (unlikely(node))
btrfs_panic(fs_info, -EEXIST,
- "zoned: inconsistency in ordered tree at offset %llu",
+ "inconsistency in ordered tree at offset %llu after split",
new->file_offset);
spin_unlock(&inode->ordered_tree_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index b6f6c6b91732..51b9e81726e2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -130,7 +130,7 @@ struct btrfs_ordered_extent {
refcount_t refs;
/* the inode we belong to */
- struct inode *inode;
+ struct btrfs_inode *inode;
/* list of checksums for insertion when the extent io is done */
struct list_head list;
@@ -162,7 +162,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
struct page *page, u64 file_offset, u64 len,
bool uptodate);
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
@@ -171,17 +171,28 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
+
+/*
+ * This represents details about the target file extent item of a write operation.
+ */
+struct btrfs_file_extent {
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 num_bytes;
+ u64 ram_bytes;
+ u64 offset;
+ u8 compression;
+};
+
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
struct btrfs_inode *inode, u64 file_offset,
- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
- u64 disk_num_bytes, u64 offset, unsigned long flags,
- int compress_type);
+ const struct btrfs_file_extent *file_extent, unsigned long flags);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
@@ -193,9 +204,9 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
struct list_head *list);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
- const u64 range_start, const u64 range_len);
+ const struct btrfs_block_group *bg);
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7e46aa8a0444..32dcea662da3 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -109,7 +109,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type
btrfs_err(eb->fs_info,
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
- btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL);
+ return;
}
ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
@@ -208,11 +208,6 @@ static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size,
struct btrfs_stripe_extent *stripe)
{
const int num_stripes = btrfs_num_raid_stripes(item_size);
- const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe);
-
- pr_info("\t\t\tencoding: %s\n",
- (encoding && encoding < BTRFS_NR_RAID_TYPES) ?
- btrfs_raid_array[encoding].raid_name : "unknown");
for (int i = 0; i < num_stripes; i++)
pr_info("\t\t\tstride %d devid %llu physical %llu\n",
@@ -310,6 +305,9 @@ void btrfs_print_leaf(const struct extent_buffer *l)
case BTRFS_EXTENT_DATA_KEY:
fi = btrfs_item_ptr(l, i,
struct btrfs_file_extent_item);
+ pr_info("\t\tgeneration %llu type %hhu\n",
+ btrfs_file_extent_generation(l, fi),
+ btrfs_file_extent_type(l, fi));
if (btrfs_file_extent_type(l, fi) ==
BTRFS_FILE_EXTENT_INLINE) {
pr_info("\t\tinline extent data size %llu\n",
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 155570e20f45..b8fa34e16abb 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -27,7 +27,7 @@ struct prop_handler {
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(struct inode *inode);
+ const char *(*extract)(const struct inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -104,7 +104,7 @@ bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name)
return handler->ignore(inode);
}
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags)
{
@@ -116,29 +116,29 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
return -EINVAL;
if (value_len == 0) {
- ret = btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name,
NULL, 0, flags);
if (ret)
return ret;
- ret = handler->apply(inode, NULL, 0);
+ ret = handler->apply(&inode->vfs_inode, NULL, 0);
ASSERT(ret == 0);
return ret;
}
- ret = btrfs_setxattr(trans, inode, handler->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, value,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(inode, value, value_len);
+ ret = handler->apply(&inode->vfs_inode, value, value_len);
if (ret) {
- btrfs_setxattr(trans, inode, handler->xattr_name, NULL,
+ btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
return ret;
}
- set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
return 0;
}
@@ -359,7 +359,7 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(struct inode *inode)
+static const char *prop_compression_extract(const struct inode *inode)
{
switch (BTRFS_I(inode)->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
@@ -385,7 +385,7 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *parent)
+ struct inode *inode, const struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index f60cd89feb29..63546d0a9444 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -15,7 +15,7 @@ struct btrfs_trans_handle;
int __init btrfs_props_init(void);
-int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
+int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
const char *name, const char *value, size_t value_len,
int flags);
int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
@@ -26,6 +26,6 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
- struct inode *dir);
+ const struct inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fc2a7ea26354..5d57a285d59b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -30,7 +30,7 @@
#include "root-tree.h"
#include "tree-checker.h"
-enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info)
{
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
return BTRFS_QGROUP_MODE_DISABLED;
@@ -39,12 +39,12 @@ enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
return BTRFS_QGROUP_MODE_FULL;
}
-bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info)
{
return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}
-bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info)
{
return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}
@@ -107,7 +107,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -117,7 +117,7 @@ static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *dest,
- struct btrfs_qgroup *src)
+ const struct btrfs_qgroup *src)
{
int i;
@@ -141,37 +141,27 @@ static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
qg->new_refcnt += mod;
}
-static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_old_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->old_refcnt < seq)
return 0;
return qg->old_refcnt - seq;
}
-static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+static inline u64 btrfs_qgroup_get_new_refcnt(const struct btrfs_qgroup *qg, u64 seq)
{
if (qg->new_refcnt < seq)
return 0;
return qg->new_refcnt - seq;
}
-/*
- * glue structure to represent the relations between qgroups.
- */
-struct btrfs_qgroup_list {
- struct list_head next_group;
- struct list_head next_member;
- struct btrfs_qgroup *group;
- struct btrfs_qgroup *member;
-};
-
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
/* must be called with qgroup_ioctl_lock held */
-static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
+static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
struct rb_node *n = fs_info->qgroup_tree.rb_node;
@@ -346,7 +336,7 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl)
{
struct btrfs_qgroup *qgroup;
@@ -608,7 +598,7 @@ out:
* Return false if no reserved space is left.
* Return true if some reserved space is leaked.
*/
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
bool ret = false;
@@ -1334,24 +1324,19 @@ out:
*/
static int flush_reservations(struct btrfs_fs_info *fs_info)
{
- struct btrfs_trans_handle *trans;
int ret;
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(fs_info->tree_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
- return ret;
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *quota_root;
+ struct btrfs_root *quota_root = NULL;
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
@@ -1446,12 +1431,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
btrfs_tree_lock(quota_root->node);
btrfs_clear_buffer_dirty(trans, quota_root->node);
btrfs_tree_unlock(quota_root->node);
- btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
- quota_root->node, 0, 1);
+ ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
+ quota_root->node, 0, 1);
- btrfs_put_root(quota_root);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
+ btrfs_put_root(quota_root);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
@@ -1572,15 +1559,21 @@ out:
return ret;
}
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
+/*
+ * Add relation between @src and @dst qgroup. The @prealloc is allocated by the
+ * callers and transferred here (either used or freed on error).
+ */
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
- struct btrfs_qgroup_list *prealloc = NULL;
int ret = 0;
+ ASSERT(prealloc);
+
/* Check the level of src and dst first */
if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
return -EINVAL;
@@ -1605,11 +1598,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst
}
}
- prealloc = kzalloc(sizeof(*list), GFP_NOFS);
- if (!prealloc) {
- ret = -ENOMEM;
- goto out;
- }
ret = add_qgroup_relation_item(trans, src, dst);
if (ret)
goto out;
@@ -1748,13 +1736,55 @@ out:
return ret;
}
-static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+/*
+ * Return 0 if we can not delete the qgroup (not empty or has children etc).
+ * Return >0 if we can delete the qgroup.
+ * Return <0 for other errors during tree search.
+ */
+static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup)
{
- return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
- qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
- qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ /*
+ * Squota would never be inconsistent, but there can still be case
+ * where a dropped subvolume still has qgroup numbers, and squota
+ * relies on such qgroup for future accounting.
+ *
+ * So for squota, do not allow dropping any non-zero qgroup.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
+ (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr))
+ return 0;
+
+ /* For higher level qgroup, we can only delete it if it has no child. */
+ if (btrfs_qgroup_level(qgroup->qgroupid)) {
+ if (!list_empty(&qgroup->members))
+ return 0;
+ return 1;
+ }
+
+ /*
+ * For level-0 qgroups, we can only delete it if it has no subvolume
+ * for it.
+ * This means even a subvolume is unlinked but not yet fully dropped,
+ * we can not delete the qgroup.
+ */
+ key.objectid = qgroup->qgroupid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = -1ULL;
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL);
+ btrfs_free_path(path);
+ /*
+ * The @ret from btrfs_find_root() exactly matches our definition for
+ * the return value, thus can be returned directly.
+ */
+ return ret;
}
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
@@ -1776,7 +1806,10 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
- if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = can_delete_qgroup(fs_info, qgroup);
+ if (ret < 0)
+ goto out;
+ if (ret == 0) {
ret = -EBUSY;
goto out;
}
@@ -1801,6 +1834,34 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
}
spin_lock(&fs_info->qgroup_lock);
+ /*
+ * Warn on reserved space. The subvolume should has no child nor
+ * corresponding subvolume.
+ * Thus its reserved space should all be zero, no matter if qgroup
+ * is consistent or the mode.
+ */
+ WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
+ /*
+ * The same for rfer/excl numbers, but that's only if our qgroup is
+ * consistent and if it's in regular qgroup mode.
+ * For simple mode it's not as accurate thus we can hit non-zero values
+ * very frequently.
+ */
+ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
+ !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ if (WARN_ON(qgroup->rfer || qgroup->excl ||
+ qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
+ btrfs_warn_rl(fs_info,
+"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
+ qgroup_mark_inconsistent(fs_info);
+ }
+ }
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
@@ -1816,6 +1877,41 @@ out:
return ret;
}
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ return 0;
+
+ /*
+ * Commit current transaction to make sure all the rfer/excl numbers
+ * get updated.
+ */
+ trans = btrfs_start_transaction(fs_info->quota_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ return ret;
+
+ /* Start new trans to delete the qgroup info and limit items. */
+ trans = btrfs_start_transaction(fs_info->quota_root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ ret = btrfs_remove_qgroup(trans, subvolid);
+ btrfs_end_transaction(trans);
+ /*
+ * It's squota and the subvolume still has numbers needed for future
+ * accounting, in this case we can not delete it. Just skip it.
+ */
+ if (ret == -EBUSY)
+ ret = 0;
+ return ret;
+}
+
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
@@ -3062,8 +3158,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_inherit *inherit,
size_t size)
{
- if (!btrfs_qgroup_enabled(fs_info))
- return 0;
if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
return -EOPNOTSUPP;
if (size < sizeof(*inherit) || size > PAGE_SIZE)
@@ -3085,6 +3179,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
return -EINVAL;
/*
+ * Skip the inherit source qgroups check if qgroup is not enabled.
+ * Qgroup can still be later enabled causing problems, but in that case
+ * btrfs_qgroup_inherit() would just ignore those invalid ones.
+ */
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
+ /*
* Now check all the remaining qgroups, they should all:
*
* - Exist
@@ -3216,7 +3318,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_qgroup_inherit *inherit)
{
int ret = 0;
- int i;
u64 *i_qgroups;
bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3273,7 +3374,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
i_qgroups = (u64 *)(inherit + 1);
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2 * inherit->num_excl_copies;
- for (i = 0; i < nums; ++i) {
+ for (int i = 0; i < nums; i++) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
/*
@@ -3300,7 +3401,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ for (int i = 0; i < inherit->num_qgroups; i++, i_qgroups++) {
if (*i_qgroups == 0)
continue;
ret = add_qgroup_relation_item(trans, objectid,
@@ -3386,7 +3487,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto unlock;
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (int i = 0; i < inherit->num_qgroups; i++) {
if (*i_qgroups) {
ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
*i_qgroups);
@@ -3406,7 +3507,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
++i_qgroups;
}
- for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_ref_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3427,7 +3528,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
/* Manually tweaking numbers certainly needs a rescan */
need_rescan = true;
}
- for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
+ for (int i = 0; i < inherit->num_excl_copies; i++, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
@@ -3912,7 +4013,6 @@ int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
int ret = 0;
- struct btrfs_trans_handle *trans;
ret = qgroup_rescan_init(fs_info, 0, 1);
if (ret)
@@ -3929,16 +4029,10 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
* going to clear all tracking information for a clean start.
*/
- trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
- if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) {
+ ret = btrfs_commit_current_transaction(fs_info->fs_root);
+ if (ret) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return PTR_ERR(trans);
- } else if (trans != ERR_PTR(-ENOENT)) {
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- return ret;
- }
+ return ret;
}
qgroup_rescan_zero_tracking(fs_info);
@@ -4074,7 +4168,6 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
*/
static int try_flush_qgroup(struct btrfs_root *root)
{
- struct btrfs_trans_handle *trans;
int ret;
/* Can't hold an open transaction or we run the risk of deadlocking. */
@@ -4095,17 +4188,9 @@ static int try_flush_qgroup(struct btrfs_root *root)
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- goto out;
- }
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@@ -4811,7 +4896,7 @@ void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_byte
}
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
- struct btrfs_squota_delta *delta)
+ const struct btrfs_squota_delta *delta)
{
int ret;
struct btrfs_qgroup *qgroup;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 706640be0ec2..deb479d176a9 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -123,7 +123,6 @@ struct btrfs_inode;
/*
* Record a dirty extent, and info qgroup to update quota on it
- * TODO: Use kmem cache to alloc it.
*/
struct btrfs_qgroup_extent_record {
struct rb_node node;
@@ -279,6 +278,14 @@ struct btrfs_qgroup {
struct kobject kobj;
};
+/* Glue structure to represent the relations between qgroups. */
+struct btrfs_qgroup_list {
+ struct list_head next_group;
+ struct list_head next_member;
+ struct btrfs_qgroup *group;
+ struct btrfs_qgroup *member;
+};
+
struct btrfs_squota_delta {
/* The fstree root this delta counts against. */
u64 root;
@@ -312,9 +319,9 @@ enum btrfs_qgroup_mode {
BTRFS_QGROUP_MODE_SIMPLE
};
-enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info);
-bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info);
-bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info);
+enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info);
+bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info);
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
@@ -322,11 +329,13 @@ int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
bool interruptible);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
+ struct btrfs_qgroup_list *prealloc);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
+int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
@@ -361,7 +370,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
enum btrfs_qgroup_rsv_type type);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
@@ -431,9 +440,9 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
-bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
+bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
- struct btrfs_squota_delta *delta);
+ const struct btrfs_squota_delta *delta);
#endif
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 6af6b4b9a32e..e6f7a234b8f6 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -80,7 +80,6 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_key stripe_key;
struct btrfs_root *stripe_root = fs_info->stripe_root;
const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
- u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
struct btrfs_stripe_extent *stripe_extent;
const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
int ret;
@@ -94,7 +93,6 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
num_stripes);
- btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
for (int i = 0; i < num_stripes; i++) {
u64 devid = bioc->stripes[i].dev->devid;
u64 physical = bioc->stripes[i].physical;
@@ -159,7 +157,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf;
const u64 end = logical + *length;
int num_stripes;
- u8 encoding;
u64 offset;
u64 found_logical;
u64 found_length;
@@ -222,16 +219,6 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
- encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
-
- if (encoding != btrfs_bg_flags_to_raid_index(map_type)) {
- ret = -EUCLEAN;
- btrfs_handle_fs_error(fs_info, ret,
- "on-disk stripe encoding %d doesn't match RAID index %d",
- encoding,
- btrfs_bg_flags_to_raid_index(map_type));
- goto out;
- }
for (int i = 0; i < num_stripes; i++) {
struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index c9c258f84903..1ac1c21aac2f 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -48,8 +48,7 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
static inline int btrfs_num_raid_stripes(u32 item_size)
{
- return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
- sizeof(struct btrfs_raid_stride);
+ return item_size / sizeof(struct btrfs_raid_stride);
}
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 831fac45e70f..39bec672df0c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -40,6 +40,85 @@
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
+{
+ if (unlikely(!bioc)) {
+ btrfs_crit(fs_info, "bioc=NULL");
+ return;
+ }
+ btrfs_crit(fs_info,
+"bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
+ bioc->logical, bioc->full_stripe_logical, bioc->size,
+ bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
+ bioc->replace_stripe_src, bioc->num_stripes);
+ for (int i = 0; i < bioc->num_stripes; i++) {
+ btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
+ i, bioc->stripes[i].dev->devid,
+ bioc->stripes[i].physical);
+ }
+}
+
+static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_raid_bio *rbio)
+{
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ return;
+
+ dump_bioc(fs_info, rbio->bioc);
+ btrfs_crit(fs_info,
+"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx",
+ rbio->flags, rbio->nr_sectors, rbio->nr_data,
+ rbio->real_stripes, rbio->stripe_nsectors,
+ rbio->scrubp, rbio->dbitmap);
+}
+
+#define ASSERT_RBIO(expr, rbio) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \
+ } \
+ ASSERT((expr)); \
+})
+
+#define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \
+({ \
+ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
+ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
+ (rbio)->bioc->fs_info : NULL; \
+ \
+ btrfs_dump_rbio(__fs_info, (rbio)); \
+ btrfs_crit(__fs_info, "logical=%llu", (logical)); \
+ } \
+ ASSERT((expr)); \
+})
+
/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
struct list_head hash_list;
@@ -592,8 +671,8 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
unsigned int stripe_nr,
unsigned int sector_nr)
{
- ASSERT(stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
@@ -873,8 +952,10 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
int index;
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
index = stripe_nr * rbio->stripe_nsectors + sector_nr;
ASSERT(index >= 0 && index < rbio->nr_sectors);
@@ -970,7 +1051,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -985,7 +1066,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages, 0);
+ rbio->stripe_pages + data_pages, false);
if (ret < 0)
return ret;
@@ -1057,8 +1138,10 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
* thus it can be larger than rbio->real_stripe.
* So here we check against bioc->num_stripes, not rbio->real_stripes.
*/
- ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
- ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
+ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
+ rbio, stripe_nr);
+ ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
+ rbio, sector_nr);
ASSERT(sector->page);
stripe = &rbio->bioc->stripes[stripe_nr];
@@ -1197,14 +1280,14 @@ static void assert_rbio(struct btrfs_raid_bio *rbio)
* At least two stripes (2 disks RAID5), and since real_stripes is U8,
* we won't go beyond 256 disks anyway.
*/
- ASSERT(rbio->real_stripes >= 2);
- ASSERT(rbio->nr_data > 0);
+ ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
+ ASSERT_RBIO(rbio->nr_data > 0, rbio);
/*
* This is another check to make sure nr data stripes is smaller
* than total stripes.
*/
- ASSERT(rbio->nr_data < rbio->real_stripes);
+ ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
/* Generate PQ for one vertical stripe. */
@@ -1557,7 +1640,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
if (ret < 0)
return ret;
@@ -1641,9 +1724,10 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
const u32 sectorsize = fs_info->sectorsize;
u64 cur_logical;
- ASSERT(orig_logical >= full_stripe_start &&
- orig_logical + orig_len <= full_stripe_start +
- rbio->nr_data * BTRFS_STRIPE_LEN);
+ ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * BTRFS_STRIPE_LEN,
+ rbio, orig_logical);
bio_list_add(&rbio->bio_list, orig_bio);
rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
@@ -2389,7 +2473,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
break;
}
}
- ASSERT(i < rbio->real_stripes);
+ ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
return rbio;
@@ -2555,7 +2639,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
* Replace is running and our parity stripe needs to be duplicated to
* the target device. Check we have a valid source stripe number.
*/
- ASSERT(rbio->bioc->replace_stripe_src >= 0);
+ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cf531255ab76..9522a8b79d22 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
u32 item_size = btrfs_item_size(leaf, slot);
unsigned long end, ptr;
u64 offset, flags, count;
- int type, ret;
+ int type;
+ int ret = 0;
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
@@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
key->objectid, key->offset);
break;
case BTRFS_EXTENT_OWNER_REF_KEY:
- WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
+ if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) {
+ btrfs_err(fs_info,
+ "found extent owner ref without simple quotas enabled");
+ ret = -EINVAL;
+ }
break;
default:
btrfs_err(fs_info, "invalid key type in iref");
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index d0a3fcecc46a..df6b93b927cd 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -733,7 +733,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
- ret = btrfs_wait_ordered_range(inode, wb_start,
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
destoff - wb_start);
if (ret)
return ret;
@@ -755,7 +755,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* range, so wait for writeback to complete before truncating pages
* from the page cache. This is a rare case.
*/
- wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
+ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
ret = ret ? ret : wb_ret;
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -835,11 +835,11 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
+ ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8b24bb5a0aa1..0533d0f82dc9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -817,7 +817,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return reloc_root;
fail:
kfree(root_item);
@@ -864,7 +864,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
*/
if (root->reloc_root) {
reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
return 0;
}
@@ -962,7 +962,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
if (!path)
return -ENOMEM;
- bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+ bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
if (ret < 0)
@@ -1739,7 +1739,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* btrfs_update_reloc_root() and update our root item
* appropriately.
*/
- reloc_root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(reloc_root, trans->transid);
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -2082,7 +2082,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
int ret;
- if (reloc_root->last_trans == trans->transid)
+ if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
@@ -2790,14 +2790,14 @@ out_free_blocks:
return ret;
}
-static noinline_for_stack int prealloc_file_extent_cluster(
- struct btrfs_inode *inode,
- const struct file_extent_cluster *cluster)
+static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc)
{
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
u64 alloc_hint = 0;
u64 start;
u64 end;
- u64 offset = inode->index_cnt;
+ u64 offset = inode->reloc_block_group_start;
u64 num_bytes;
int nr;
int ret = 0;
@@ -2899,11 +2899,14 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return ret;
}
-static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
- u64 start, u64 end, u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc)
{
+ struct btrfs_inode *inode = BTRFS_I(rc->data_inode);
struct extent_map *em;
struct extent_state *cached_state = NULL;
+ u64 offset = inode->reloc_block_group_start;
+ u64 start = rc->cluster.start - offset;
+ u64 end = rc->cluster.end - offset;
int ret = 0;
em = alloc_extent_map();
@@ -2912,13 +2915,14 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->start = start;
em->len = end + 1 - start;
- em->block_len = em->len;
- em->block_start = block_start;
+ em->disk_bytenr = rc->cluster.start;
+ em->disk_num_bytes = em->len;
+ em->ram_bytes = em->len;
em->flags |= EXTENT_FLAG_PINNED;
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
- ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
+ lock_extent(&inode->io_tree, start, end, &cached_state);
+ ret = btrfs_replace_extent_map_range(inode, em, false);
+ unlock_extent(&inode->io_tree, start, end, &cached_state);
free_extent_map(em);
return ret;
@@ -2946,12 +2950,14 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
return cluster->boundary[cluster_nr + 1] - 1;
}
-static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
- const struct file_extent_cluster *cluster,
+static int relocate_one_folio(struct reloc_control *rc,
+ struct file_ra_state *ra,
int *cluster_nr, unsigned long index)
{
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
@@ -2975,8 +2981,7 @@ static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
if (folio_test_readahead(folio))
page_cache_async_readahead(inode->i_mapping, ra, NULL,
- folio, index,
- last_index + 1 - index);
+ folio, last_index + 1 - index);
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
@@ -3083,10 +3088,11 @@ release_folio:
return ret;
}
-static int relocate_file_extent_cluster(struct inode *inode,
- const struct file_extent_cluster *cluster)
+static int relocate_file_extent_cluster(struct reloc_control *rc)
{
- u64 offset = BTRFS_I(inode)->index_cnt;
+ struct inode *inode = rc->data_inode;
+ const struct file_extent_cluster *cluster = &rc->cluster;
+ u64 offset = BTRFS_I(inode)->reloc_block_group_start;
unsigned long index;
unsigned long last_index;
struct file_ra_state *ra;
@@ -3100,21 +3106,20 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
+ ret = prealloc_file_extent_cluster(rc);
if (ret)
goto out;
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
- cluster->end - offset, cluster->start);
+ ret = setup_relocation_extent_mapping(rc);
if (ret)
goto out;
last_index = (cluster->end - offset) >> PAGE_SHIFT;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
- ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index);
+ ret = relocate_one_folio(rc, ra, &cluster_nr, index);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3122,15 +3127,16 @@ out:
return ret;
}
-static noinline_for_stack int relocate_data_extent(struct inode *inode,
- const struct btrfs_key *extent_key,
- struct file_extent_cluster *cluster)
+static noinline_for_stack int relocate_data_extent(struct reloc_control *rc,
+ const struct btrfs_key *extent_key)
{
+ struct inode *inode = rc->data_inode;
+ struct file_extent_cluster *cluster = &rc->cluster;
int ret;
struct btrfs_root *root = BTRFS_I(inode)->root;
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3156,7 +3162,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode,
* the cluster we need to relocate.
*/
root->relocation_src_root = cluster->owning_root;
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3175,7 +3181,7 @@ static noinline_for_stack int relocate_data_extent(struct inode *inode,
cluster->nr++;
if (cluster->nr >= MAX_EXTENTS) {
- ret = relocate_file_extent_cluster(inode, cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret)
return ret;
cluster->nr = 0;
@@ -3369,7 +3375,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
if (inode)
goto truncate;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget(ino, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3744,8 +3750,7 @@ restart:
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = true;
- ret = relocate_data_extent(rc->data_inode,
- &key, &rc->cluster);
+ ret = relocate_data_extent(rc, &key);
if (ret < 0) {
err = ret;
break;
@@ -3774,8 +3779,7 @@ restart:
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode,
- &rc->cluster);
+ ret = relocate_file_extent_cluster(rc);
if (ret < 0)
err = ret;
}
@@ -3908,14 +3912,14 @@ static noinline_for_stack struct inode *create_reloc_inode(
if (ret)
goto out;
- inode = btrfs_iget(fs_info->sb, objectid, root);
+ inode = btrfs_iget(objectid, root);
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
ret = PTR_ERR(inode);
inode = NULL;
goto out;
}
- BTRFS_I(inode)->index_cnt = group->start;
+ BTRFS_I(inode)->reloc_block_group_start = group->start;
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
@@ -4002,15 +4006,13 @@ static void free_reloc_control(struct reloc_control *rc)
/*
* Print the block group being relocated
*/
-static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group)
+static void describe_relocation(struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
- btrfs_info(fs_info,
- "relocating block group %llu flags %s",
+ btrfs_info(block_group->fs_info, "relocating block group %llu flags %s",
block_group->start, buf);
}
@@ -4118,13 +4120,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- describe_relocation(fs_info, rc->block_group);
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
- btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->start,
- rc->block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);
ret = btrfs_zone_finish(rc->block_group);
WARN_ON(ret && ret != -EAGAIN);
@@ -4149,7 +4149,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
* out of the loop if we hit an error.
*/
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
(u64)-1);
if (ret)
err = ret;
@@ -4221,8 +4221,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
struct extent_buffer *leaf;
struct reloc_control *rc = NULL;
struct btrfs_trans_handle *trans;
- int ret;
- int err = 0;
+ int ret2;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -4236,15 +4236,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
while (1) {
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
path, 0, 0);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
if (ret > 0) {
if (path->slots[0] == 0)
break;
path->slots[0]--;
}
+ ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
@@ -4255,7 +4254,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
- err = PTR_ERR(reloc_root);
+ ret = PTR_ERR(reloc_root);
goto out;
}
@@ -4267,15 +4266,12 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
reloc_root->root_key.offset, false);
if (IS_ERR(fs_root)) {
ret = PTR_ERR(fs_root);
- if (ret != -ENOENT) {
- err = ret;
+ if (ret != -ENOENT)
goto out;
- }
ret = mark_garbage_root(reloc_root);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- }
+ ret = 0;
} else {
btrfs_put_root(fs_root);
}
@@ -4293,15 +4289,13 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc = alloc_reloc_control(fs_info);
if (!rc) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
ret = reloc_chunk_start(fs_info);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out_end;
- }
rc->extent_root = btrfs_extent_root(fs_info, 0);
@@ -4309,7 +4303,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_unset;
}
@@ -4329,15 +4323,15 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
+ ret = PTR_ERR(fs_root);
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_end_transaction(trans);
goto out_unset;
}
- err = __add_reloc_root(reloc_root);
- ASSERT(err != -EEXIST);
- if (err) {
+ ret = __add_reloc_root(reloc_root);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
btrfs_end_transaction(trans);
@@ -4347,8 +4341,8 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_root);
}
- err = btrfs_commit_transaction(trans);
- if (err)
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
goto out_unset;
merge_reloc_roots(rc);
@@ -4357,14 +4351,14 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
trans = btrfs_join_transaction(rc->extent_root);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_clean;
}
- err = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
out_clean:
- ret = clean_dirty_subvols(rc);
- if (ret < 0 && !err)
- err = ret;
+ ret2 = clean_dirty_subvols(rc);
+ if (ret2 < 0 && !ret)
+ ret = ret2;
out_unset:
unset_reloc_control(rc);
out_end:
@@ -4375,14 +4369,14 @@ out:
btrfs_free_path(path);
- if (err == 0) {
+ if (ret == 0) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
- err = btrfs_orphan_cleanup(fs_root);
+ ret = btrfs_orphan_cleanup(fs_root);
btrfs_put_root(fs_root);
}
- return err;
+ return ret;
}
/*
@@ -4393,9 +4387,9 @@ out:
*/
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
+ u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
LIST_HEAD(list);
int ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index afd6932f5e89..14a8d7100018 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -261,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, false);
if (ret < 0)
goto error;
@@ -1688,20 +1688,24 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
(i << fs_info->sectorsize_bits);
int err;
- bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
- fs_info, scrub_read_endio, stripe);
- bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
-
io_stripe.is_scrub = true;
+ stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
+ /*
+ * For RST cases, we need to manually split the bbio to
+ * follow the RST boundary.
+ */
err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &stripe_len, &bioc, &io_stripe,
- &mirror);
+ &stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err) {
- btrfs_bio_end_io(bbio,
- errno_to_blk_status(err));
- return;
+ if (err < 0) {
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ continue;
}
+
+ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
+ fs_info, scrub_read_endio, stripe);
+ bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
}
__bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
@@ -2437,19 +2441,15 @@ static int finish_extent_writes_for_zoned(struct btrfs_root *root,
struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct btrfs_trans_handle *trans;
if (!btrfs_is_zoned(fs_info))
return 0;
btrfs_wait_block_group_reservations(cache);
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(root);
}
static noinline_for_stack
@@ -2680,8 +2680,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
if (sctx->is_dev_replace) {
btrfs_wait_nocow_writers(cache);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
- cache->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache);
}
scrub_pause_off(fs_info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3dd4a48479a9..4ca711a773ef 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5188,11 +5188,10 @@ out:
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct inode *inode;
struct fs_path *p;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+ inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5307,7 +5306,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (folio_test_readahead(folio))
page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
- index, last_index + 1 - index);
+ last_index + 1 - index);
if (!folio_test_uptodate(folio)) {
btrfs_read_folio(NULL, folio);
@@ -5550,7 +5549,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5617,7 +5616,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
u32 crc;
int ret;
- inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
+ inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -5746,7 +5745,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
+ sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
if (IS_ERR(sctx->cur_inode)) {
int err = PTR_ERR(sctx->cur_inode);
@@ -7998,34 +7997,18 @@ out:
*/
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
- int i;
- struct btrfs_trans_handle *trans = NULL;
-
-again:
- if (sctx->parent_root &&
- sctx->parent_root->node != sctx->parent_root->commit_root)
- goto commit_trans;
-
- for (i = 0; i < sctx->clone_roots_cnt; i++)
- if (sctx->clone_roots[i].root->node !=
- sctx->clone_roots[i].root->commit_root)
- goto commit_trans;
-
- if (trans)
- return btrfs_end_transaction(trans);
+ struct btrfs_root *root = sctx->parent_root;
- return 0;
+ if (root && root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
-commit_trans:
- /* Use any root, all fs roots will get their commit roots updated. */
- if (!trans) {
- trans = btrfs_join_transaction(sctx->send_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- goto again;
+ for (int i = 0; i < sctx->clone_roots_cnt; i++) {
+ root = sctx->clone_roots[i].root;
+ if (root->node != root->commit_root)
+ return btrfs_commit_current_transaction(root);
}
- return btrfs_commit_transaction(trans);
+ return 0;
}
/*
@@ -8046,7 +8029,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -8054,7 +8037,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+ btrfs_wait_ordered_extents(root, U64_MAX, NULL);
}
return 0;
@@ -8082,10 +8065,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(inode)->root;
+ struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index dd1c9f02b011..b07f4aa66878 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct inode;
+struct btrfs_inode;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d620323d08ea..9ac94d3119e8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#include "linux/spinlock.h"
+#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
#include "space-info.h"
@@ -190,6 +192,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+#define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
+
/*
* Calculate chunk size depending on volume type (regular or zoned).
*/
@@ -232,6 +236,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
+ space_info->fs_info = info;
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -340,11 +345,32 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
return NULL;
}
+static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo;
+ u64 data_chunk_size;
+
+ /*
+ * Calculate the data_chunk_size, space_info->chunk_size is the
+ * "optimal" chunk size based on the fs size. However when we actually
+ * allocate the chunk we will strip this down further, making it no
+ * more than 10% of the disk or 1G, whichever is smaller.
+ *
+ * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
+ * as it is.
+ */
+ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ if (btrfs_is_zoned(fs_info))
+ return data_sinfo->chunk_size;
+ data_chunk_size = min(data_sinfo->chunk_size,
+ mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
+ return min_t(u64, data_chunk_size, SZ_1G);
+}
+
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo;
u64 profile;
u64 avail;
u64 data_chunk_size;
@@ -368,16 +394,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
if (avail == 0)
return 0;
- /*
- * Calculate the data_chunk_size, space_info->chunk_size is the
- * "optimal" chunk size based on the fs size. However when we actually
- * allocate the chunk we will strip this down further, making it no more
- * than 10% of the disk or 1G, whichever is smaller.
- */
- data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- data_chunk_size = min(data_sinfo->chunk_size,
- mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
- data_chunk_size = min_t(u64, data_chunk_size, SZ_1G);
+ data_chunk_size = calc_effective_data_chunk_size(fs_info);
/*
* Since data allocations immediately use block groups as part of the
@@ -405,6 +422,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+
+ /*
+ * On the zoned mode, we always allocate one zone as one chunk.
+ * Returning non-zone size alingned bytes here will result in
+ * less pressure for the async metadata reclaim process, and it
+ * will over-commit too much leading to ENOSPC. Align down to the
+ * zone size to avoid that.
+ */
+ if (btrfs_is_zoned(fs_info))
+ avail = ALIGN_DOWN(avail, fs_info->zone_size);
+
return avail;
}
@@ -587,8 +615,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
return nr;
}
-#define EXTENT_SIZE_PER_ITEM SZ_256K
-
/*
* shrink metadata reservation for delalloc
*/
@@ -688,7 +714,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
skip_async:
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, items, NULL);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
@@ -807,14 +833,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
* because that does not wait for a transaction to fully commit
* (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- if (ret == -ENOENT)
- ret = 0;
- break;
- }
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(root);
break;
default:
ret = -ENOSPC;
@@ -1868,3 +1887,209 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
+
+static u64 calc_pct_ratio(u64 x, u64 y)
+{
+ int err;
+
+ if (!y)
+ return 0;
+again:
+ err = check_mul_overflow(100, x, &x);
+ if (err)
+ goto lose_precision;
+ return div64_u64(x, y);
+lose_precision:
+ x >>= 10;
+ y >>= 10;
+ if (!y)
+ y = 1;
+ goto again;
+}
+
+/*
+ * A reasonable buffer for unallocated space is 10 data block_groups.
+ * If we claw this back repeatedly, we can still achieve efficient
+ * utilization when near full, and not do too much reclaim while
+ * always maintaining a solid buffer for workloads that quickly
+ * allocate and pressure the unallocated space.
+ */
+static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
+
+ return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
+}
+
+/*
+ * The fundamental goal of automatic reclaim is to protect the filesystem's
+ * unallocated space and thus minimize the probability of the filesystem going
+ * read only when a metadata allocation failure causes a transaction abort.
+ *
+ * However, relocations happen into the space_info's unused space, therefore
+ * automatic reclaim must also back off as that space runs low. There is no
+ * value in doing trivial "relocations" of re-writing the same block group
+ * into a fresh one.
+ *
+ * Furthermore, we want to avoid doing too much reclaim even if there are good
+ * candidates. This is because the allocator is pretty good at filling up the
+ * holes with writes. So we want to do just enough reclaim to try and stay
+ * safe from running out of unallocated space but not be wasteful about it.
+ *
+ * Therefore, the dynamic reclaim threshold is calculated as follows:
+ * - calculate a target unallocated amount of 5 block group sized chunks
+ * - ratchet up the intensity of reclaim depending on how far we are from
+ * that target by using a formula of unalloc / target to set the threshold.
+ *
+ * Typically with 10 block groups as the target, the discrete values this comes
+ * out to are 0, 10, 20, ... , 80, 90, and 99.
+ */
+static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 target = calc_unalloc_target(fs_info);
+ u64 alloc = space_info->total_bytes;
+ u64 used = btrfs_space_info_used(space_info, false);
+ u64 unused = alloc - used;
+ u64 want = target > unalloc ? target - unalloc : 0;
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ /* If we have no unused space, don't bother, it won't work anyway. */
+ if (unused < data_chunk_size)
+ return 0;
+
+ /* Cast to int is OK because want <= target. */
+ return calc_pct_ratio(want, target);
+}
+
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info)
+{
+ lockdep_assert_held(&space_info->lock);
+
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return calc_dynamic_reclaim_threshold(space_info);
+ return READ_ONCE(space_info->bg_reclaim_threshold);
+}
+
+/*
+ * Under "urgent" reclaim, we will reclaim even fresh block groups that have
+ * recently seen successful allocations, as we are desperate to reclaim
+ * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
+ */
+static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
+ u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
+ u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
+
+ return unalloc < data_chunk_size;
+}
+
+static int do_reclaim_sweep(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, int raid)
+{
+ struct btrfs_block_group *bg;
+ int thresh_pct;
+ bool try_again = true;
+ bool urgent;
+
+ spin_lock(&space_info->lock);
+ urgent = is_reclaim_urgent(space_info);
+ thresh_pct = btrfs_calc_reclaim_threshold(space_info);
+ spin_unlock(&space_info->lock);
+
+ down_read(&space_info->groups_sem);
+again:
+ list_for_each_entry(bg, &space_info->block_groups[raid], list) {
+ u64 thresh;
+ bool reclaim = false;
+
+ btrfs_get_block_group(bg);
+ spin_lock(&bg->lock);
+ thresh = mult_perc(bg->length, thresh_pct);
+ if (bg->used < thresh && bg->reclaim_mark) {
+ try_again = false;
+ reclaim = true;
+ }
+ bg->reclaim_mark++;
+ spin_unlock(&bg->lock);
+ if (reclaim)
+ btrfs_mark_bg_to_reclaim(bg);
+ btrfs_put_block_group(bg);
+ }
+
+ /*
+ * In situations where we are very motivated to reclaim (low unalloc)
+ * use two passes to make the reclaim mark check best effort.
+ *
+ * If we have any staler groups, we don't touch the fresher ones, but if we
+ * really need a block group, do take a fresh one.
+ */
+ if (try_again && urgent) {
+ try_again = false;
+ goto again;
+ }
+
+ up_read(&space_info->groups_sem);
+ return 0;
+}
+
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
+{
+ u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
+
+ lockdep_assert_held(&space_info->lock);
+ space_info->reclaimable_bytes += bytes;
+
+ if (space_info->reclaimable_bytes >= chunk_sz)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
+}
+
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
+{
+ lockdep_assert_held(&space_info->lock);
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return;
+ if (ready != space_info->periodic_reclaim_ready) {
+ space_info->periodic_reclaim_ready = ready;
+ if (!ready)
+ space_info->reclaimable_bytes = 0;
+ }
+}
+
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+{
+ bool ret;
+
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ return false;
+ if (!READ_ONCE(space_info->periodic_reclaim))
+ return false;
+
+ spin_lock(&space_info->lock);
+ ret = space_info->periodic_reclaim_ready;
+ btrfs_set_periodic_reclaim_ready(space_info, false);
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+ int raid;
+ struct btrfs_space_info *space_info;
+
+ list_for_each_entry(space_info, &fs_info->space_info, list) {
+ if (!btrfs_should_periodic_reclaim(space_info))
+ continue;
+ for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
+ ret = do_reclaim_sweep(fs_info, space_info, raid);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index a733458fd13b..4db8a0267c16 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -94,6 +94,7 @@ enum btrfs_flush_state {
};
struct btrfs_space_info {
+ struct btrfs_fs_info *fs_info;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -165,6 +166,47 @@ struct btrfs_space_info {
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
+
+ /*
+ * Monotonically increasing counter of block group reclaim attempts
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_count
+ */
+ u64 reclaim_count;
+
+ /*
+ * Monotonically increasing counter of reclaimed bytes
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_bytes
+ */
+ u64 reclaim_bytes;
+
+ /*
+ * Monotonically increasing counter of reclaim errors
+ * Exposed in /sys/fs/<uuid>/allocation/<type>/reclaim_errors
+ */
+ u64 reclaim_errors;
+
+ /*
+ * If true, use the dynamic relocation threshold, instead of the
+ * fixed bg_reclaim_threshold.
+ */
+ bool dynamic_reclaim;
+
+ /*
+ * Periodically check all block groups against the reclaim
+ * threshold in the cleaner thread.
+ */
+ bool periodic_reclaim;
+
+ /*
+ * Periodic reclaim should be a no-op if a space_info hasn't
+ * freed any space since the last time we tried.
+ */
+ bool periodic_reclaim_ready;
+
+ /*
+ * Net bytes freed or allocated since the last reclaim pass.
+ */
+ s64 reclaimable_bytes;
};
struct reserve_ticket {
@@ -247,4 +289,10 @@ void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
+void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
+bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
+int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info);
+int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info);
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 54736f6238e6..8ddd5fcbeb93 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -74,7 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!mapping || !mapping->host || is_data_inode(mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
return true;
/*
@@ -242,12 +242,12 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int start_bit; \
+ unsigned int __start_bit; \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- start_bit += fs_info->subpage_info->name##_offset; \
- start_bit; \
+ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ __start_bit += fs_info->subpage_info->name##_offset; \
+ __start_bit; \
})
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
@@ -283,7 +283,7 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
- is_data = is_data_inode(folio->mapping->host);
+ is_data = is_data_inode(BTRFS_I(folio->mapping->host));
spin_lock_irqsave(&subpage->lock, flags);
@@ -703,19 +703,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_subpage *subpage;
+ unsigned int start_bit;
+ unsigned int nbits;
+ unsigned long flags;
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!folio_test_dirty(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ ASSERT(!folio_test_dirty(folio));
return;
+ }
- ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+ start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ subpage = folio_get_private(folio);
+ ASSERT(subpage);
+ spin_lock_irqsave(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&subpage->lock, flags);
}
/*
@@ -765,6 +775,130 @@ void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
+/*
+ * This is for folio already locked by plain lock_page()/folio_lock(), which
+ * doesn't have any subpage awareness.
+ *
+ * This populates the involved subpage ranges so that subpage helpers can
+ * properly unlock them.
+ */
+void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+ unsigned long flags;
+ unsigned int start_bit;
+ unsigned int nbits;
+ int ret;
+
+ ASSERT(folio_test_locked(folio));
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ return;
+
+ subpage = folio_get_private(folio);
+ start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
+ nbits = len >> fs_info->sectorsize_bits;
+ spin_lock_irqsave(&subpage->lock, flags);
+ /* Target range should not yet be locked. */
+ ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ bitmap_set(subpage->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Find any subpage writer locked range inside @folio, starting at file offset
+ * @search_start. The caller should ensure the folio is locked.
+ *
+ * Return true and update @found_start_ret and @found_len_ret to the first
+ * writer locked range.
+ * Return false if there is no writer locked range.
+ */
+bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 search_start,
+ u64 *found_start_ret, u32 *found_len_ret)
+{
+ struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
+ const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
+ locked, search_start, len);
+ const unsigned int locked_bitmap_start = subpage_info->locked_offset;
+ const unsigned int locked_bitmap_end = locked_bitmap_start +
+ subpage_info->bitmap_nr_bits;
+ unsigned long flags;
+ int first_zero;
+ int first_set;
+ bool found = false;
+
+ ASSERT(folio_test_locked(folio));
+ spin_lock_irqsave(&subpage->lock, flags);
+ first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
+ if (first_set >= locked_bitmap_end)
+ goto out;
+
+ found = true;
+
+ *found_start_ret = folio_pos(folio) +
+ ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
+ /*
+ * Since @first_set is ensured to be smaller than locked_bitmap_end
+ * here, @found_start_ret should be inside the folio.
+ */
+ ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
+
+ first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
+ *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
+out:
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return found;
+}
+
+/*
+ * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range,
+ * this ends all writer locked ranges of a page.
+ *
+ * This is for the locked page of __extent_writepage(), as the locked page
+ * can contain several locked subpage ranges.
+ */
+void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio)
+{
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ u64 folio_start = folio_pos(folio);
+ u64 cur = folio_start;
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+
+ /* The page has no new delalloc range locked on it. Just plain unlock. */
+ if (atomic_read(&subpage->writers) == 0) {
+ folio_unlock(folio);
+ return;
+ }
+ while (cur < folio_start + PAGE_SIZE) {
+ u64 found_start;
+ u32 found_len;
+ bool found;
+ bool last;
+
+ found = btrfs_subpage_find_writer_locked(fs_info, folio, cur,
+ &found_start, &found_len);
+ if (!found)
+ break;
+ last = btrfs_subpage_end_and_test_writer(fs_info, folio,
+ found_start, found_len);
+ if (last) {
+ folio_unlock(folio);
+ break;
+ }
+ cur = found_start + found_len;
+ }
+}
+
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
bitmap_cut(dst, subpage->bitmaps, 0, \
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
@@ -775,7 +909,6 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
unsigned long uptodate_bitmap;
- unsigned long error_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
@@ -797,10 +930,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
-"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
+"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
- subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
subpage_info->bitmap_nr_bits, &writeback_bitmap,
subpage_info->bitmap_nr_bits, &ordered_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index b6dc013b0fdc..249396e118d0 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -112,6 +112,12 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 search_start,
+ u64 *found_start_ret, u32 *found_len_ret);
+void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio);
/*
* Template for subpage related operations.
@@ -156,7 +162,8 @@ DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
-void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f05cce7c8b8d..08d33cb372fb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -34,6 +34,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
+#include "direct-io.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
@@ -81,7 +82,7 @@ struct btrfs_fs_context {
u32 commit_interval;
u32 metadata_ratio;
u32 thread_pool_size;
- unsigned long mount_opt;
+ unsigned long long mount_opt;
unsigned long compress_type:4;
unsigned int compress_level;
refcount_t refs;
@@ -125,9 +126,6 @@ enum {
Opt_rescue,
Opt_usebackuproot,
Opt_nologreplay,
- Opt_ignorebadroots,
- Opt_ignoredatacsums,
- Opt_rescue_all,
/* Debugging options */
Opt_enospc_debug,
@@ -178,6 +176,8 @@ enum {
Opt_rescue_nologreplay,
Opt_rescue_ignorebadroots,
Opt_rescue_ignoredatacsums,
+ Opt_rescue_ignoremetacsums,
+ Opt_rescue_ignoresuperflags,
Opt_rescue_parameter_all,
};
@@ -187,7 +187,11 @@ static const struct constant_table btrfs_parameter_rescue[] = {
{ "ignorebadroots", Opt_rescue_ignorebadroots },
{ "ibadroots", Opt_rescue_ignorebadroots },
{ "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "ignoremetacsums", Opt_rescue_ignoremetacsums},
+ { "ignoresuperflags", Opt_rescue_ignoresuperflags},
{ "idatacsums", Opt_rescue_ignoredatacsums },
+ { "imetacsums", Opt_rescue_ignoremetacsums},
+ { "isuperflags", Opt_rescue_ignoresuperflags},
{ "all", Opt_rescue_parameter_all },
{}
};
@@ -573,8 +577,16 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_rescue_ignoredatacsums:
btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
+ case Opt_rescue_ignoremetacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
+ break;
+ case Opt_rescue_ignoresuperflags:
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
+ break;
case Opt_rescue_parameter_all:
btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREMETACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNORESUPERFLAGS);
btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
@@ -629,8 +641,8 @@ static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
}
-static bool check_ro_option(struct btrfs_fs_info *fs_info,
- unsigned long mount_opt, unsigned long opt,
+static bool check_ro_option(const struct btrfs_fs_info *fs_info,
+ unsigned long long mount_opt, unsigned long long opt,
const char *opt_name)
{
if (mount_opt & opt) {
@@ -641,7 +653,8 @@ static bool check_ro_option(struct btrfs_fs_info *fs_info,
return false;
}
-bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+bool btrfs_check_options(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt,
unsigned long flags)
{
bool ret = true;
@@ -649,7 +662,9 @@ bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
if (!(flags & SB_RDONLY) &&
(check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREMETACSUMS, "ignoremetacsums") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNORESUPERFLAGS, "ignoresuperflags")))
ret = false;
if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
@@ -949,7 +964,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
+ inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
btrfs_handle_fs_error(fs_info, err, NULL);
@@ -983,7 +998,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1065,6 +1080,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
print_rescue_option(seq, "ignorebadroots", &printed);
if (btrfs_test_opt(info, IGNOREDATACSUMS))
print_rescue_option(seq, "ignoredatacsums", &printed);
+ if (btrfs_test_opt(info, IGNOREMETACSUMS))
+ print_rescue_option(seq, "ignoremetacsums", &printed);
+ if (btrfs_test_opt(info, IGNORESUPERFLAGS))
+ print_rescue_option(seq, "ignoresuperflags", &printed);
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(info, DISCARD_SYNC))
@@ -1213,7 +1232,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
- unsigned long old_opts, int flags)
+ unsigned long long old_opts, int flags)
{
if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
(!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -1227,7 +1246,7 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
}
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
- unsigned long old_opts)
+ unsigned long long old_opts)
{
const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
@@ -1422,6 +1441,8 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+ btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
+ btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
@@ -2257,9 +2278,7 @@ out:
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_trans_handle *trans;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- struct btrfs_root *root = fs_info->tree_root;
set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
/*
@@ -2268,14 +2287,7 @@ static int btrfs_freeze(struct super_block *sb)
* we want to avoid on a frozen filesystem), or do the commit
* ourselves.
*/
- trans = btrfs_attach_transaction_barrier(root);
- if (IS_ERR(trans)) {
- /* no transaction, don't bother */
- if (PTR_ERR(trans) == -ENOENT)
- return 0;
- return PTR_ERR(trans);
- }
- return btrfs_commit_transaction(trans);
+ return btrfs_commit_current_transaction(fs_info->tree_root);
}
static int check_dev_super(struct btrfs_device *dev)
@@ -2499,6 +2511,9 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_init_cachep,
.exit_func = btrfs_destroy_cachep,
}, {
+ .init_func = btrfs_init_dio,
+ .exit_func = btrfs_destroy_dio,
+ }, {
.init_func = btrfs_transaction_init,
.exit_func = btrfs_transaction_exit,
}, {
@@ -2590,6 +2605,7 @@ static int __init init_btrfs_fs(void)
late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
+MODULE_DESCRIPTION("B-Tree File System (BTRFS)");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: xxhash64");
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index cbcab434b5ec..d80a86acfbbe 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -10,7 +10,8 @@
struct super_block;
struct btrfs_fs_info;
-bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+bool btrfs_check_options(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt,
unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index af545b6b1190..03926ad467c9 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -385,6 +385,8 @@ static const char *rescue_opts[] = {
"nologreplay",
"ignorebadroots",
"ignoredatacsums",
+ "ignoremetacsums",
+ "ignoresuperflags",
"all",
};
@@ -894,6 +896,9 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
+SPACE_INFO_ATTR(reclaim_count);
+SPACE_INFO_ATTR(reclaim_bytes);
+SPACE_INFO_ATTR(reclaim_errors);
BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
@@ -902,8 +907,12 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
+ ssize_t ret;
- return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
+ spin_lock(&space_info->lock);
+ ret = sysfs_emit(buf, "%d\n", btrfs_calc_reclaim_threshold(space_info));
+ spin_unlock(&space_info->lock);
+ return ret;
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -914,6 +923,9 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
int thresh;
int ret;
+ if (READ_ONCE(space_info->dynamic_reclaim))
+ return -EINVAL;
+
ret = kstrtoint(buf, 10, &thresh);
if (ret)
return ret;
@@ -930,6 +942,72 @@ BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
btrfs_sinfo_bg_reclaim_threshold_show,
btrfs_sinfo_bg_reclaim_threshold_store);
+static ssize_t btrfs_sinfo_dynamic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->dynamic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_dynamic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int dynamic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &dynamic_reclaim);
+ if (ret)
+ return ret;
+
+ if (dynamic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->dynamic_reclaim, dynamic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, dynamic_reclaim,
+ btrfs_sinfo_dynamic_reclaim_show,
+ btrfs_sinfo_dynamic_reclaim_store);
+
+static ssize_t btrfs_sinfo_periodic_reclaim_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+
+ return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->periodic_reclaim));
+}
+
+static ssize_t btrfs_sinfo_periodic_reclaim_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_space_info *space_info = to_space_info(kobj);
+ int periodic_reclaim;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &periodic_reclaim);
+ if (ret)
+ return ret;
+
+ if (periodic_reclaim < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(space_info->periodic_reclaim, periodic_reclaim != 0);
+
+ return len;
+}
+
+BTRFS_ATTR_RW(space_info, periodic_reclaim,
+ btrfs_sinfo_periodic_reclaim_show,
+ btrfs_sinfo_periodic_reclaim_store);
+
/*
* Allocation information about block group types.
*
@@ -947,8 +1025,13 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ BTRFS_ATTR_PTR(space_info, dynamic_reclaim),
BTRFS_ATTR_PTR(space_info, chunk_size),
BTRFS_ATTR_PTR(space_info, size_classes),
+ BTRFS_ATTR_PTR(space_info, reclaim_count),
+ BTRFS_ATTR_PTR(space_info, reclaim_bytes),
+ BTRFS_ATTR_PTR(space_info, reclaim_errors),
+ BTRFS_ATTR_PTR(space_info, periodic_reclaim),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index dce0387ef155..ce50847e1e01 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -61,10 +61,7 @@ struct inode *btrfs_new_test_inode(void)
return NULL;
inode->i_mode = S_IFREG;
- inode->i_ino = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
- BTRFS_I(inode)->location.offset = 0;
+ btrfs_set_inode_number(BTRFS_I(inode), BTRFS_FIRST_FREE_OBJECTID);
inode_init_owner(&nop_mnt_idmap, inode, NULL, S_IFREG);
return inode;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index ba36794ba2d5..ebec4ab361b8 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -19,8 +19,8 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
int ret = 0;
write_lock(&em_tree->lock);
- while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
- node = rb_first_cached(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->root)) {
+ node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(inode, em);
@@ -28,9 +28,10 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
if (refcount_read(&em->refs) != 1) {
ret = -EINVAL;
test_err(
-"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
- em->start, em->len, em->block_start,
- em->block_len, refcount_read(&em->refs));
+"em leak: em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu offset %llu) refs %d",
+ em->start, em->len, em->disk_bytenr,
+ em->disk_num_bytes, em->offset,
+ refcount_read(&em->refs));
refcount_set(&em->refs, 1);
}
@@ -76,8 +77,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -97,8 +99,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_16K;
em->len = SZ_4K;
- em->block_start = SZ_32K; /* avoid merging */
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_32K; /* avoid merging */
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -118,8 +121,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 8K), should return [0, 16K) instead. */
em->start = start;
em->len = len;
- em->block_start = start;
- em->block_len = len;
+ em->disk_bytenr = start;
+ em->disk_num_bytes = len;
+ em->ram_bytes = len;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -134,11 +138,11 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
if (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K) {
+ em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
-"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -172,8 +176,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -193,8 +198,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -214,8 +220,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Add [0, 1K) */
em->start = 0;
em->len = SZ_1K;
- em->block_start = EXTENT_MAP_INLINE;
- em->block_len = (u64)-1;
+ em->disk_bytenr = EXTENT_MAP_INLINE;
+ em->disk_num_bytes = 0;
+ em->ram_bytes = SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -229,11 +236,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
if (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
+ em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
-"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
- ret, em->start, em->len, em->block_start,
- em->block_len);
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
+ ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
free_extent_map(em);
@@ -263,8 +269,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [4K, 8K) */
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_4K;
- em->block_len = SZ_4K;
+ em->disk_bytenr = SZ_4K;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_4K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -284,8 +291,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
/* Add [0, 16K) */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_16K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
@@ -305,11 +313,11 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
* em->start.
*/
if (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len) {
+ em->start != extent_map_block_start(em)) {
test_err(
-"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
- em->block_start, em->block_len);
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -370,8 +378,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 8K) */
em->start = 0;
em->len = SZ_8K;
- em->block_start = 0;
- em->block_len = SZ_8K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_8K;
+ em->ram_bytes = SZ_8K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -391,8 +400,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [8K, 32K) */
em->start = SZ_8K;
em->len = 24 * SZ_1K;
- em->block_start = SZ_16K; /* avoid merging */
- em->block_len = 24 * SZ_1K;
+ em->disk_bytenr = SZ_16K; /* avoid merging */
+ em->disk_num_bytes = 24 * SZ_1K;
+ em->ram_bytes = 24 * SZ_1K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -411,8 +421,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
/* Add [0K, 32K) */
em->start = 0;
em->len = SZ_32K;
- em->block_start = 0;
- em->block_len = SZ_32K;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_32K;
+ em->ram_bytes = SZ_32K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, start, len);
write_unlock(&em_tree->lock);
@@ -429,9 +440,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
}
if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
- start, start + len, ret, em->start, em->len, em->block_start,
- em->block_len);
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
+ start, start + len, ret, em->start, em->len,
+ em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
free_extent_map(em);
@@ -495,8 +506,9 @@ static int add_compressed_extent(struct btrfs_inode *inode,
em->start = start;
em->len = len;
- em->block_start = block_start;
- em->block_len = SZ_4K;
+ em->disk_bytenr = block_start;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = len;
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
@@ -551,7 +563,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
struct rb_node *n;
int i;
- for (i = 0, n = rb_first_cached(&em_tree->map);
+ for (i = 0, n = rb_first(&em_tree->root);
valid_ranges[index][i].len && n;
i++, n = rb_next(n)) {
struct extent_map *entry = rb_entry(n, struct extent_map, rb_node);
@@ -716,8 +728,9 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->start = SZ_4K;
em->len = SZ_4K;
- em->block_start = SZ_16K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_16K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K);
write_unlock(&em_tree->lock);
@@ -769,9 +782,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* [0, 16K), pinned */
em->start = 0;
em->len = SZ_16K;
- em->block_start = 0;
- em->block_len = SZ_4K;
- em->flags |= EXTENT_FLAG_PINNED;
+ em->disk_bytenr = 0;
+ em->disk_num_bytes = SZ_4K;
+ em->ram_bytes = SZ_16K;
+ em->flags |= (EXTENT_FLAG_PINNED | EXTENT_FLAG_COMPRESS_ZLIB);
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -791,8 +805,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* [32K, 48K), not pinned */
em->start = SZ_32K;
em->len = SZ_16K;
- em->block_start = SZ_32K;
- em->block_len = SZ_16K;
+ em->disk_bytenr = SZ_32K;
+ em->disk_num_bytes = SZ_16K;
+ em->ram_bytes = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
@@ -855,8 +870,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- if (em->block_start != SZ_32K + SZ_4K) {
- test_err("em->block_start is %llu, expected 36K", em->block_start);
+ if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ test_err("em->block_start is %llu, expected 36K",
+ extent_map_block_start(em));
goto out;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 99da9d34b77a..3ea3bc2225fe 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -117,7 +117,7 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
/* Now for a regular extent */
insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0,
- disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
+ disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot);
slot++;
disk_bytenr += sectorsize;
offset += sectorsize - 1;
@@ -264,8 +264,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
free_extent_map(em);
@@ -283,8 +283,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_INLINE) {
- test_err("expected an inline, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_INLINE) {
+ test_err("expected an inline, got %llu", em->disk_bytenr);
goto out;
}
@@ -321,8 +321,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 4) {
@@ -344,8 +344,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize - 1) {
@@ -358,9 +358,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -372,8 +371,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -386,12 +385,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -401,8 +399,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -423,8 +421,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -437,15 +435,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -457,8 +455,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -472,9 +470,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -486,8 +483,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -501,12 +498,11 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -516,8 +512,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_HOLE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_HOLE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -530,15 +526,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("unexpected orig offset, wanted %llu, have %llu",
- orig_start, em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("unexpected offset, wanted %llu, have %llu",
+ em->start - orig_start, em->offset);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -549,8 +544,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -564,15 +559,14 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
prealloc_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu", orig_start,
- em->orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
- if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ if (extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + (em->start - em->orig_start),
- em->block_start);
+ disk_bytenr + em->offset, extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
@@ -584,8 +578,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -599,9 +593,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -618,8 +611,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -633,9 +626,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -643,7 +635,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
- disk_bytenr = em->block_start;
+ disk_bytenr = extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
free_extent_map(em);
@@ -653,8 +645,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -667,9 +659,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -680,9 +671,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != disk_bytenr) {
+ if (extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, em->block_start);
+ disk_bytenr, extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -696,9 +687,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
compressed_only, em->flags);
goto out;
}
- if (em->orig_start != orig_start) {
- test_err("wrong orig offset, want %llu, have %llu",
- em->start, orig_start);
+ if (em->start - em->offset != orig_start) {
+ test_err("wrong offset, em->start=%llu em->offset=%llu orig_start=%llu",
+ em->start, em->offset, orig_start);
goto out;
}
if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
@@ -715,8 +706,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -729,9 +720,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -742,8 +732,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole extent, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole extent, got %llu", em->disk_bytenr);
goto out;
}
/*
@@ -762,9 +752,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
vacancy_only, em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
offset = em->start + em->len;
@@ -775,8 +764,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
+ test_err("expected a real extent, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != offset || em->len != sectorsize) {
@@ -789,9 +778,8 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
- if (em->orig_start != em->start) {
- test_err("wrong orig offset, want %llu, have %llu", em->start,
- em->orig_start);
+ if (em->offset != 0) {
+ test_err("wrong orig offset, want 0, have %llu", em->offset);
goto out;
}
ret = 0;
@@ -855,8 +843,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != EXTENT_MAP_HOLE) {
- test_err("expected a hole, got %llu", em->block_start);
+ if (em->disk_bytenr != EXTENT_MAP_HOLE) {
+ test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
if (em->start != 0 || em->len != sectorsize) {
@@ -877,8 +865,8 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
- if (em->block_start != sectorsize) {
- test_err("expected a real extent, got %llu", em->block_start);
+ if (extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu", extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3388c836b9a5..5e6fff8e1003 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -405,7 +405,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- root->last_trans < trans->transid) || force) {
+ btrfs_get_root_last_trans(root) < trans->transid) || force) {
WARN_ON(!force && root->commit_root != root->node);
/*
@@ -421,7 +421,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
- if (root->last_trans == trans->transid && !force) {
+ if (btrfs_get_root_last_trans(root) == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
@@ -429,7 +429,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_root_id(root),
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
- root->last_trans = trans->transid;
+ btrfs_set_root_last_trans(root, trans->transid);
/* this is pretty tricky. We don't want to
* take the relocation lock in btrfs_record_root_in_trans
@@ -491,7 +491,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
* and barriers
*/
smp_rmb();
- if (root->last_trans == trans->transid &&
+ if (btrfs_get_root_last_trans(root) == trans->transid &&
!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
@@ -1637,7 +1637,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = pending->dir;
+ struct inode *parent_inode = &pending->dir->vfs_inode;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1989,6 +1989,25 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
btrfs_put_transaction(cur_trans);
}
+/*
+ * If there is a running transaction commit it or if it's already committing,
+ * wait for its commit to complete. Does not start and commit a new transaction
+ * if there isn't any running.
+ */
+int btrfs_commit_current_transaction(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ int ret = PTR_ERR(trans);
+
+ return (ret == -ENOENT) ? 0 : ret;
+ }
+
+ return btrfs_commit_transaction(trans);
+}
+
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2110,7 +2129,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
/*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4e451ab173b1..98c03ddc760b 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -172,7 +172,7 @@ struct btrfs_trans_handle {
struct btrfs_pending_snapshot {
struct dentry *dentry;
- struct inode *dir;
+ struct btrfs_inode *dir;
struct btrfs_root *root;
struct btrfs_root_item *root_item;
struct btrfs_root *snap;
@@ -229,11 +229,11 @@ bool __cold abort_should_print_stack(int error);
*/
#define btrfs_abort_transaction(trans, error) \
do { \
- bool first = false; \
+ bool __first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- first = true; \
+ __first = true; \
if (WARN(abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
@@ -246,7 +246,7 @@ do { \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
- __LINE__, (error), first); \
+ __LINE__, (error), __first); \
} while (0)
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
@@ -268,6 +268,7 @@ void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
+int btrfs_commit_current_transaction(struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a2c3651a3d8f..6388786fd8b5 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -340,6 +340,24 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
}
+ /*
+ * For non-compressed data extents, ram_bytes should match its
+ * disk_num_bytes.
+ * However we do not really utilize ram_bytes in this case, so this check
+ * is only optional for DEBUG builds for developers to catch the
+ * unexpected behaviors.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG) &&
+ btrfs_file_extent_compression(leaf, fi) == BTRFS_COMPRESS_NONE &&
+ btrfs_file_extent_disk_bytenr(leaf, fi)) {
+ if (WARN_ON(btrfs_file_extent_ram_bytes(leaf, fi) !=
+ btrfs_file_extent_disk_num_bytes(leaf, fi)))
+ file_extent_err(leaf, slot,
+"mismatch ram_bytes (%llu) and disk_num_bytes (%llu) for non-compressed extent",
+ btrfs_file_extent_ram_bytes(leaf, fi),
+ btrfs_file_extent_disk_num_bytes(leaf, fi));
+ }
+
return 0;
}
@@ -1682,9 +1700,6 @@ static int check_inode_ref(struct extent_buffer *leaf,
static int check_raid_stripe_extent(const struct extent_buffer *leaf,
const struct btrfs_key *key, int slot)
{
- struct btrfs_stripe_extent *stripe_extent =
- btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
-
if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
generic_err(leaf, slot,
"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
@@ -1698,22 +1713,6 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf,
return -EUCLEAN;
}
- switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
- case BTRFS_STRIPE_RAID0:
- case BTRFS_STRIPE_RAID1:
- case BTRFS_STRIPE_DUP:
- case BTRFS_STRIPE_RAID10:
- case BTRFS_STRIPE_RAID5:
- case BTRFS_STRIPE_RAID6:
- case BTRFS_STRIPE_RAID1C3:
- case BTRFS_STRIPE_RAID1C4:
- break;
- default:
- generic_err(leaf, slot, "invalid raid stripe encoding %u",
- btrfs_stripe_extent_encoding(leaf, stripe_extent));
- return -EUCLEAN;
- }
-
return 0;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 26a2e5aa08e9..f0cf8ce26f01 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -138,6 +138,25 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
+static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+{
+ unsigned int nofs_flag;
+ struct inode *inode;
+
+ /*
+ * We're holding a transaction handle whether we are logging or
+ * replaying a log tree, so we must make sure NOFS semantics apply
+ * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
+ * to allocate an inode, which can recurse back into the filesystem and
+ * attempt a transaction commit, resulting in a deadlock.
+ */
+ nofs_flag = memalloc_nofs_save();
+ inode = btrfs_iget(objectid, root);
+ memalloc_nofs_restore(nofs_flag);
+
+ return inode;
+}
+
/*
* start a sub transaction and setup the log tree
* this increments the log tree writer count to make the people
@@ -600,7 +619,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
{
struct inode *inode;
- inode = btrfs_iget(root->fs_info->sb, objectid, root);
+ inode = btrfs_iget_logging(objectid, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -1625,7 +1644,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->i_mode))
+ BTRFS_I(inode)->index_cnt = (u64)-1;
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
@@ -2820,7 +2840,7 @@ static void wait_for_writer(struct btrfs_root *root)
finish_wait(&root->log_writer_wait, &wait);
}
-void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
@@ -2839,7 +2859,7 @@ void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
{
- struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_inode *inode = ctx->inode;
if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
!test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
@@ -2857,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_extent *tmp;
- ASSERT(inode_is_locked(ctx->inode));
+ ASSERT(inode_is_locked(&ctx->inode->vfs_inode));
list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
list_del_init(&ordered->log_list);
@@ -4234,8 +4254,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, bool inode_item_dropped)
{
struct btrfs_inode_item *inode_item;
+ struct btrfs_key key;
int ret;
+ btrfs_get_inode_key(inode, &key);
/*
* If we are doing a fast fsync and the inode was logged before in the
* current transaction, then we know the inode was previously logged and
@@ -4247,7 +4269,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* already exists can also result in unnecessarily splitting a leaf.
*/
if (!inode_item_dropped && inode->logged_trans == trans->transid) {
- ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
+ ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
ASSERT(ret <= 0);
if (ret > 0)
ret = -ENOENT;
@@ -4261,7 +4283,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
* the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
* flags and set ->logged_trans to 0.
*/
- ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
sizeof(*inode_item));
ASSERT(ret != -EEXIST);
}
@@ -4575,6 +4597,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
{
struct btrfs_ordered_extent *ordered;
struct btrfs_root *csum_root;
+ u64 block_start;
u64 csum_offset;
u64 csum_len;
u64 mod_start = em->start;
@@ -4584,7 +4607,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
if (inode->flags & BTRFS_INODE_NODATASUM ||
(em->flags & EXTENT_FLAG_PREALLOC) ||
- em->block_start == EXTENT_MAP_HOLE)
+ em->disk_bytenr == EXTENT_MAP_HOLE)
return 0;
list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
@@ -4648,17 +4671,18 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
/* If we're compressed we have to save the entire range of csums. */
if (extent_map_is_compressed(em)) {
csum_offset = 0;
- csum_len = max(em->block_len, em->orig_block_len);
+ csum_len = em->disk_num_bytes;
} else {
csum_offset = mod_start - em->start;
csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
- csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
- ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
- em->block_start + csum_offset +
- csum_len - 1, &ordered_sums, false);
+ block_start = extent_map_block_start(em);
+ csum_root = btrfs_csum_root(trans->fs_info, block_start);
+ ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
+ block_start + csum_offset + csum_len - 1,
+ &ordered_sums, false);
if (ret < 0)
return ret;
ret = 0;
@@ -4688,7 +4712,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
enum btrfs_compression_type compress_type;
- u64 extent_offset = em->start - em->orig_start;
+ u64 extent_offset = em->offset;
+ u64 block_start = extent_map_block_start(em);
u64 block_len;
int ret;
@@ -4698,14 +4723,13 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
- block_len = max(em->block_len, em->orig_block_len);
+ block_len = em->disk_num_bytes;
compress_type = extent_map_compression(em);
if (compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
- extent_offset);
+ } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
}
@@ -5438,7 +5462,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_root *root = start_inode->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5499,7 +5522,7 @@ again:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+ di_inode = btrfs_iget_logging(di_key.objectid, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto out;
@@ -5559,7 +5582,7 @@ again:
btrfs_add_delayed_iput(curr_inode);
curr_inode = NULL;
- vfs_inode = btrfs_iget(fs_info->sb, ino, root);
+ vfs_inode = btrfs_iget_logging(ino, root);
if (IS_ERR(vfs_inode)) {
ret = PTR_ERR(vfs_inode);
break;
@@ -5654,7 +5677,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
return BTRFS_LOG_FORCE_COMMIT;
- inode = btrfs_iget(root->fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was deleted in
* the current transaction then we either:
@@ -5755,7 +5778,6 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
/*
@@ -5786,7 +5808,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
list_del(&curr->list);
kfree(curr);
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -5797,7 +5819,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
break;
- inode = btrfs_iget(fs_info->sb, parent, root);
+ inode = btrfs_iget_logging(parent, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
break;
@@ -5910,7 +5932,7 @@ again:
if (ret < 0) {
return ret;
} else if (ret > 0 &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ other_ino != btrfs_ino(ctx->inode)) {
if (ins_nr > 0) {
ins_nr++;
} else {
@@ -6319,7 +6341,6 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
const bool orig_log_new_dentries = ctx->log_new_dentries;
- struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_item *item;
int ret = 0;
@@ -6345,7 +6366,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
if (key.type == BTRFS_ROOT_ITEM_KEY)
continue;
- di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+ di_inode = btrfs_iget_logging(key.objectid, inode->root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
break;
@@ -6729,7 +6750,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
@@ -6794,8 +6814,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
- root);
+ dir_inode = btrfs_iget_logging(inode_key.objectid, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -6857,7 +6876,6 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
while (true) {
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
@@ -6872,7 +6890,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, ino, root);
+ inode = btrfs_iget_logging(ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -7061,6 +7079,15 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
}
/*
+ * If we're logging an inode from a subvolume created in the current
+ * transaction we must force a commit since the root is not persisted.
+ */
+ if (btrfs_root_generation(&root->root_item) == trans->transid) {
+ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ /*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
@@ -7441,6 +7468,24 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
}
/*
+ * Call this when creating a subvolume in a directory.
+ * Because we don't commit a transaction when creating a subvolume, we can't
+ * allow the directory pointing to the subvolume to be logged with an entry that
+ * points to an unpersisted root if we are still in the transaction used to
+ * create the subvolume, so make any attempt to log the directory to result in a
+ * full log sync.
+ * Also we don't need to worry with renames, since btrfs_rename() marks the log
+ * for full commit when renaming a subvolume.
+ */
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir)
+{
+ mutex_lock(&dir->log_mutex);
+ dir->last_unlink_trans = trans->transid;
+ mutex_unlock(&dir->log_mutex);
+}
+
+/*
* Update the log after adding a new name for an inode.
*
* @trans: Transaction handle.
@@ -7572,7 +7617,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ btrfs_init_log_ctx(&ctx, inode);
ctx.logging_new_name = true;
btrfs_init_log_ctx_scratch_eb(&ctx);
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 22e9cbc81577..dc313e6bb2fa 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -37,7 +37,7 @@ struct btrfs_log_ctx {
bool logging_new_delayed_dentries;
/* Indicate if the inode being logged was logged before. */
bool logged_before;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
struct list_head ordered_extents;
@@ -55,7 +55,7 @@ struct btrfs_log_ctx {
struct extent_buffer *scratch_eb;
};
-void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode);
+void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode);
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx);
void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx);
@@ -94,6 +94,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
bool for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
+void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct dentry *old_dentry, struct btrfs_inode *old_dir,
u64 old_dir_index, struct dentry *parent);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 183863f4bfa4..fc59b57257d6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -50,6 +50,7 @@ void ulist_init(struct ulist *ulist)
INIT_LIST_HEAD(&ulist->nodes);
ulist->root = RB_ROOT;
ulist->nnodes = 0;
+ ulist->prealloc = NULL;
}
/*
@@ -68,6 +69,8 @@ void ulist_release(struct ulist *ulist)
list_for_each_entry_safe(node, next, &ulist->nodes, list) {
kfree(node);
}
+ kfree(ulist->prealloc);
+ ulist->prealloc = NULL;
ulist->root = RB_ROOT;
INIT_LIST_HEAD(&ulist->nodes);
}
@@ -105,6 +108,12 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
+void ulist_prealloc(struct ulist *ulist, gfp_t gfp_mask)
+{
+ if (!ulist->prealloc)
+ ulist->prealloc = kzalloc(sizeof(*ulist->prealloc), gfp_mask);
+}
+
/*
* Free dynamically allocated ulist.
*
@@ -206,9 +215,15 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
*old_aux = node->aux;
return 0;
}
- node = kmalloc(sizeof(*node), gfp_mask);
- if (!node)
- return -ENOMEM;
+
+ if (ulist->prealloc) {
+ node = ulist->prealloc;
+ ulist->prealloc = NULL;
+ } else {
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
+ }
node->val = val;
node->aux = aux;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 8e200fe1a2dd..c62a372f1462 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -41,12 +41,14 @@ struct ulist {
struct list_head nodes;
struct rb_root root;
+ struct ulist_node *prealloc;
};
void ulist_init(struct ulist *ulist);
void ulist_release(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
struct ulist *ulist_alloc(gfp_t gfp_mask);
+void ulist_prealloc(struct ulist *ulist, gfp_t mask);
void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index b0aff297d67d..eae75bb572b9 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -13,7 +13,7 @@
#include "accessors.h"
#include "uuid-tree.h"
-static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key)
{
key->type = type;
key->objectid = get_unaligned_le64(uuid);
@@ -21,7 +21,7 @@ static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
}
/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
-static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid,
u8 type, u64 subid)
{
int ret;
@@ -81,7 +81,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid_cpu)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -145,7 +145,7 @@ out:
return ret;
}
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -256,7 +256,7 @@ out:
* < 0 if an error occurred
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
- u8 *uuid, u8 type, u64 subvolid)
+ const u8 *uuid, u8 type, u64 subvolid)
{
int ret = 0;
struct btrfs_root *subvol_root;
diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h
index 080ede0227ae..a3f5757cc7cf 100644
--- a/fs/btrfs/uuid-tree.h
+++ b/fs/btrfs/uuid-tree.h
@@ -8,9 +8,9 @@
struct btrfs_trans_handle;
struct btrfs_fs_info;
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
-int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type,
u64 subid);
int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c39145e8c4ad..fcedc43ef291 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -722,7 +722,7 @@ error_free_page:
return -EINVAL;
}
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
{
bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -1380,20 +1380,13 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- u64 bytenr, bytenr_orig;
+ u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
/*
- * we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
-
- /*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
* resulting in failure.
@@ -1407,7 +1400,12 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
- bytenr_orig = btrfs_sb_offset(0);
+ /*
+ * We would like to check all the super blocks, but doing so would
+ * allow a mount to succeed after a mkfs from a different filesystem.
+ * Currently, recovery from a bad primary btrfs superblock is done
+ * using the userspace command 'btrfs check --super'.
+ */
ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
if (ret) {
device = ERR_PTR(ret);
@@ -1415,7 +1413,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
}
disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
- bytenr_orig);
+ btrfs_sb_offset(0));
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -2991,16 +2989,19 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_handle_fs_error(fs_info, -ENOENT,
- "Failed lookup while freeing chunk.");
- ret = -ENOENT;
+ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
+ chunk_offset);
+ btrfs_abort_transaction(trans, -ENOENT);
+ ret = -EUCLEAN;
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret < 0)
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to delete chunk item.");
+ if (ret < 0) {
+ btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5628,8 +5629,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
- int i;
- int j;
map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
@@ -5644,8 +5643,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
- for (i = 0; i < ctl->ndevs; ++i) {
- for (j = 0; j < ctl->dev_stripes; ++j) {
+ for (int i = 0; i < ctl->ndevs; i++) {
+ for (int j = 0; j < ctl->dev_stripes; j++) {
int s = i * ctl->dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
@@ -6288,20 +6287,19 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
return ret;
}
-static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_io_context *bioc,
+static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_dev_replace *dev_replace,
u64 logical,
- int *num_stripes_ret, int *max_errors_ret)
+ struct btrfs_io_geometry *io_geom)
{
u64 srcdev_devid = dev_replace->srcdev->devid;
/*
* At this stage, num_stripes is still the real number of stripes,
* excluding the duplicated stripes.
*/
- int num_stripes = *num_stripes_ret;
+ int num_stripes = io_geom->num_stripes;
+ int max_errors = io_geom->max_errors;
int nr_extra_stripes = 0;
- int max_errors = *max_errors_ret;
int i;
/*
@@ -6342,7 +6340,7 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
* replace.
* If we have 2 extra stripes, only choose the one with smaller physical.
*/
- if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
+ if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
@@ -6360,8 +6358,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
}
}
- *num_stripes_ret = num_stripes + nr_extra_stripes;
- *max_errors_ret = max_errors + nr_extra_stripes;
+ io_geom->num_stripes = num_stripes + nr_extra_stripes;
+ io_geom->max_errors = max_errors + nr_extra_stripes;
bioc->replace_nr_stripes = nr_extra_stripes;
}
@@ -6624,7 +6622,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_chunk_map *map;
struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- int i;
int ret = 0;
int num_copies;
struct btrfs_io_context *bioc = NULL;
@@ -6770,7 +6767,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < io_geom.num_stripes; i++) {
+ for (int i = 0; i < io_geom.num_stripes; i++) {
ret = set_io_stripe(fs_info, logical, length,
&bioc->stripes[i], map, &io_geom);
if (ret < 0)
@@ -6790,8 +6787,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
- handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &io_geom.num_stripes, &io_geom.max_errors);
+ handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
}
*bioc_ret = bioc;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 66e6fc481ecd..37a09ebb34dd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -834,6 +834,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 15d0999e340e..738c7bb8ea7c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -24,7 +24,7 @@
#include "accessors.h"
#include "dir-item.h"
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
@@ -451,7 +451,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_set_prop(trans, inode, name, value, size, flags);
+ ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
inode_set_ctime_current(inode);
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b9376ea258ff..8dc4cf49f6f0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -14,7 +14,7 @@ struct btrfs_trans_handle;
extern const struct xattr_handler * const btrfs_xattr_handlers[];
-int btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(const struct inode *inode, const char *name,
void *buffer, size_t size);
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const void *value, size_t size, int flags);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index d9e5c88a0f85..30971dd741e2 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -18,6 +18,7 @@
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/refcount.h>
+#include "btrfs_inode.h"
#include "compression.h"
/* workspace buffer size for s390 zlib hardware support */
@@ -112,8 +113,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
*total_out = 0;
*total_in = 0;
- if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) {
- pr_warn("BTRFS: deflateInit failed\n");
+ ret = zlib_deflateInit(&workspace->strm, workspace->level);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib compression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
@@ -182,9 +188,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
}
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
- if (ret != Z_OK) {
- pr_debug("BTRFS: deflate in loop returned %d\n",
- ret);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+ "zlib compression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
zlib_deflateEnd(&workspace->strm);
ret = -EIO;
goto out;
@@ -307,9 +317,14 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
kunmap_local(data_in);
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
return -EIO;
}
while (workspace->strm.total_in < srclen) {
@@ -348,10 +363,15 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.avail_in = min(tmp, PAGE_SIZE);
}
}
- if (ret != Z_STREAM_END)
+ if (unlikely(ret != Z_STREAM_END)) {
+ btrfs_err(cb->bbio.inode->root->fs_info,
+ "zlib decompression failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(cb->bbio.inode->root),
+ btrfs_ino(cb->bbio.inode), cb->start);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
done:
zlib_inflateEnd(&workspace->strm);
if (data_in)
@@ -386,8 +406,14 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
workspace->strm.avail_in -= 2;
}
- if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
- pr_warn("BTRFS: inflateInit failed\n");
+ ret = zlib_inflateInit2(&workspace->strm, wbits);
+ if (unlikely(ret != Z_OK)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zlib decompression init failed, error %d root %llu inode %llu offset %llu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
return -EIO;
}
@@ -404,8 +430,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
out:
if (unlikely(to_copy != destlen)) {
- pr_warn_ratelimited("BTRFS: inflate failed, decompressed=%lu expected=%zu\n",
- to_copy, destlen);
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+"zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu",
+ ret, btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page), to_copy, destlen);
ret = -EIO;
} else {
ret = 0;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 947a87576f6c..66f63e82af79 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -87,9 +87,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
full[i] = sb_zone_is_full(&zones[i]);
@@ -121,9 +120,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
struct address_space *mapping = bdev->bd_mapping;
struct page *page[BTRFS_NR_SB_LOG_ZONES];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
- int i;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
BTRFS_SUPER_INFO_SIZE;
@@ -144,7 +142,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
else
sector = zones[0].start;
- for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
+ for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
btrfs_release_disk_super(super[i]);
} else if (!full[0] && (empty[1] || full[1])) {
sector = zones[0].wp;
@@ -652,8 +650,7 @@ out:
return NULL;
}
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
+static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
unsigned int nr_zones = 1;
int ret;
@@ -770,7 +767,8 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -1726,7 +1724,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
if (!btrfs_is_zoned(fs_info))
return false;
- if (!inode || !is_data_inode(&inode->vfs_inode))
+ if (!inode || !is_data_inode(inode))
return false;
if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
@@ -1768,7 +1766,7 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
u64 logical)
{
- struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
+ struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
struct extent_map *em;
ordered->disk_bytenr = logical;
@@ -1776,7 +1774,9 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
write_lock(&em_tree->lock);
em = search_extent_mapping(em_tree, ordered->file_offset,
ordered->num_bytes);
- em->block_start = logical;
+ /* The em should be a new COW extent, thus it should not have an offset. */
+ ASSERT(em->offset == 0);
+ em->disk_bytenr = logical;
free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1787,7 +1787,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
+ split_extent_map(ordered->inode, ordered->file_offset,
ordered->num_bytes, len, logical))
return false;
@@ -1801,7 +1801,7 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
- struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+ struct btrfs_inode *inode = ordered->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *sum;
u64 logical, len;
@@ -1845,7 +1845,7 @@ out:
* here so that we don't attempt to log the csums later.
*/
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
+ test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
while ((sum = list_first_entry_or_null(&ordered->list,
typeof(*sum), list))) {
list_del(&sum->list);
@@ -2215,8 +2215,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
- btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
- block_group->length);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
/* Wait for extent buffers to be written. */
if (is_metadata)
wait_eb_writebacks(block_group);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 77c4321e331f..30b2e48a1cec 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -53,14 +53,13 @@ struct btrfs_zoned_device_info {
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);
#ifdef CONFIG_BLK_DEV_ZONED
-int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
+int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -98,11 +97,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
-static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
- struct blk_zone *zone)
-{
- return 0;
-}
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
@@ -136,8 +130,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
- unsigned long *mount_opt)
+static inline int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
+ unsigned long long *mount_opt)
{
return 0;
}
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 2b232b82c3a8..2a079561b2b1 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -19,6 +19,7 @@
#include <linux/zstd.h>
#include "misc.h"
#include "fs.h"
+#include "btrfs_inode.h"
#include "compression.h"
#include "super.h"
@@ -399,8 +400,13 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Initialize the stream */
stream = zstd_init_cstream(&params, len, workspace->mem,
workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_cstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd compression init level %d failed, root %llu inode %llu offset %llu",
+ workspace->req_level, btrfs_root_id(inode->root),
+ btrfs_ino(inode), start);
ret = -EIO;
goto out;
}
@@ -429,9 +435,14 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret2 = zstd_compress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_compress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_warn(inode->root->fs_info,
+"zstd compression level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -497,9 +508,14 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
size_t ret2;
ret2 = zstd_end_stream(stream, &workspace->out_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_end_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = BTRFS_I(mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+"zstd compression end level %d failed, error %d root %llu inode %llu offset %llu",
+ workspace->req_level, zstd_get_error_code(ret2),
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ start);
ret = -EIO;
goto out;
}
@@ -561,8 +577,12 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_debug("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
@@ -580,9 +600,13 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
ret2 = zstd_decompress_stream(stream, &workspace->out_buf,
&workspace->in_buf);
- if (zstd_is_error(ret2)) {
- pr_debug("BTRFS: zstd_decompress_stream returned %d\n",
- zstd_get_error_code(ret2));
+ if (unlikely(zstd_is_error(ret2))) {
+ struct btrfs_inode *inode = cb->bbio.inode;
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret2), btrfs_root_id(inode->root),
+ btrfs_ino(inode), cb->start);
ret = -EIO;
goto done;
}
@@ -637,8 +661,14 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
stream = zstd_init_dstream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
- if (!stream) {
- pr_warn("BTRFS: zstd_init_dstream failed\n");
+ if (unlikely(!stream)) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression init failed, root %llu inode %llu offset %llu",
+ btrfs_root_id(inode->root), btrfs_ino(inode),
+ page_offset(dest_page));
+ ret = -EIO;
goto finish;
}
@@ -655,9 +685,13 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
* one call should end the decompression.
*/
ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf);
- if (zstd_is_error(ret)) {
- pr_warn_ratelimited("BTRFS: zstd_decompress_stream return %d\n",
- zstd_get_error_code(ret));
+ if (unlikely(zstd_is_error(ret))) {
+ struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host);
+
+ btrfs_err(inode->root->fs_info,
+ "zstd decompression failed, error %d root %llu inode %llu offset %llu",
+ zstd_get_error_code(ret), btrfs_root_id(inode->root),
+ btrfs_ino(inode), page_offset(dest_page));
goto finish;
}
to_copy = workspace->out_buf.pos;
diff --git a/fs/buffer.c b/fs/buffer.c
index 8c19e705b9c3..e55ad471c530 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -258,7 +258,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
} else {
clear_buffer_uptodate(bh);
buffer_io_error(bh, ", async page read");
- folio_set_error(folio);
}
/*
@@ -391,7 +390,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
buffer_io_error(bh, ", lost async page write");
mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
- folio_set_error(folio);
}
first = folio_buffers(folio);
@@ -1960,7 +1958,6 @@ recover:
clear_buffer_dirty(bh);
}
} while ((bh = bh->b_this_page) != head);
- folio_set_error(folio);
BUG_ON(folio_test_writeback(folio));
mapping_set_error(folio->mapping, err);
folio_start_writeback(folio);
@@ -2187,6 +2184,8 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
struct buffer_head *bh, *head;
bh = head = folio_buffers(folio);
+ if (!bh)
+ return;
blocksize = bh->b_size;
block_start = 0;
@@ -2405,10 +2404,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
- if (err) {
- folio_set_error(folio);
+ if (err)
page_error = true;
- }
}
if (!buffer_mapped(bh)) {
folio_zero_range(folio, i * blocksize,
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index f449f7340aad..9fb06dc16520 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/namei.h>
+#include <trace/events/fscache.h>
#include "internal.h"
/*
@@ -312,19 +313,59 @@ static void cachefiles_withdraw_objects(struct cachefiles_cache *cache)
}
/*
- * Withdraw volumes.
+ * Withdraw fscache volumes.
+ */
+static void cachefiles_withdraw_fscache_volumes(struct cachefiles_cache *cache)
+{
+ struct list_head *cur;
+ struct cachefiles_volume *volume;
+ struct fscache_volume *vcookie;
+
+ _enter("");
+retry:
+ spin_lock(&cache->object_list_lock);
+ list_for_each(cur, &cache->volumes) {
+ volume = list_entry(cur, struct cachefiles_volume, cache_link);
+
+ if (atomic_read(&volume->vcookie->n_accesses) == 0)
+ continue;
+
+ vcookie = fscache_try_get_volume(volume->vcookie,
+ fscache_volume_get_withdraw);
+ if (vcookie) {
+ spin_unlock(&cache->object_list_lock);
+ fscache_withdraw_volume(vcookie);
+ fscache_put_volume(vcookie, fscache_volume_put_withdraw);
+ goto retry;
+ }
+ }
+ spin_unlock(&cache->object_list_lock);
+
+ _leave("");
+}
+
+/*
+ * Withdraw cachefiles volumes.
*/
static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
{
_enter("");
for (;;) {
+ struct fscache_volume *vcookie = NULL;
struct cachefiles_volume *volume = NULL;
spin_lock(&cache->object_list_lock);
if (!list_empty(&cache->volumes)) {
volume = list_first_entry(&cache->volumes,
struct cachefiles_volume, cache_link);
+ vcookie = fscache_try_get_volume(volume->vcookie,
+ fscache_volume_get_withdraw);
+ if (!vcookie) {
+ spin_unlock(&cache->object_list_lock);
+ cpu_relax();
+ continue;
+ }
list_del_init(&volume->cache_link);
}
spin_unlock(&cache->object_list_lock);
@@ -332,6 +373,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
break;
cachefiles_withdraw_volume(volume);
+ fscache_put_volume(vcookie, fscache_volume_put_withdraw);
}
_leave("");
@@ -371,6 +413,7 @@ void cachefiles_withdraw_cache(struct cachefiles_cache *cache)
pr_info("File cache on %s unregistering\n", fscache->name);
fscache_withdraw_cache(fscache);
+ cachefiles_withdraw_fscache_volumes(cache);
/* we now have to destroy all the active objects pertaining to this
* cache - which we do by passing them off to thread pool to be
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 06cdf1a8a16f..89b11336a836 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -366,14 +366,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
if (cachefiles_in_ondemand_mode(cache)) {
if (!xa_empty(&cache->reqs)) {
- rcu_read_lock();
+ xas_lock(&xas);
xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
if (!cachefiles_ondemand_is_reopening_read(req)) {
mask |= EPOLLIN;
break;
}
}
- rcu_read_unlock();
+ xas_unlock(&xas);
}
} else {
if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 6845a90cdfcc..7b99bd98de75 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -48,6 +48,7 @@ enum cachefiles_object_state {
CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */
CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */
CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */
+ CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped. */
};
struct cachefiles_ondemand_info {
@@ -128,6 +129,7 @@ struct cachefiles_cache {
unsigned long req_id_next;
struct xarray ondemand_ids; /* xarray for ondemand_id allocation */
u32 ondemand_id_next;
+ u32 msg_id_next;
};
static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
@@ -335,6 +337,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \
CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN);
CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE);
CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING);
+CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING);
static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
{
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index bce005f2b456..470c96658385 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -517,7 +517,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
*/
xas_lock(&xas);
- if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ if (test_bit(CACHEFILES_DEAD, &cache->flags) ||
+ cachefiles_ondemand_object_is_dropping(object)) {
xas_unlock(&xas);
ret = -EIO;
goto out;
@@ -527,20 +528,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
smp_mb();
if (opcode == CACHEFILES_OP_CLOSE &&
- !cachefiles_ondemand_object_is_open(object)) {
+ !cachefiles_ondemand_object_is_open(object)) {
WARN_ON_ONCE(object->ondemand->ondemand_id == 0);
xas_unlock(&xas);
ret = -EIO;
goto out;
}
- xas.xa_index = 0;
+ /*
+ * Cyclically find a free xas to avoid msg_id reuse that would
+ * cause the daemon to successfully copen a stale msg_id.
+ */
+ xas.xa_index = cache->msg_id_next;
xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART) {
+ xas.xa_index = 0;
+ xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK);
+ }
if (xas.xa_node == XAS_RESTART)
xas_set_err(&xas, -EBUSY);
+
xas_store(&xas, req);
- xas_clear_mark(&xas, XA_FREE_MARK);
- xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ if (xas_valid(&xas)) {
+ cache->msg_id_next = xas.xa_index + 1;
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ }
xas_unlock(&xas);
} while (xas_nomem(&xas, GFP_KERNEL));
@@ -568,7 +581,8 @@ out:
* If error occurs after creating the anonymous fd,
* cachefiles_ondemand_fd_release() will set object to close.
*/
- if (opcode == CACHEFILES_OP_OPEN)
+ if (opcode == CACHEFILES_OP_OPEN &&
+ !cachefiles_ondemand_object_is_dropping(object))
cachefiles_ondemand_set_object_close(object);
kfree(req);
return ret;
@@ -667,8 +681,34 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
{
+ unsigned long index;
+ struct cachefiles_req *req;
+ struct cachefiles_cache *cache;
+
+ if (!object->ondemand)
+ return;
+
cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0,
cachefiles_ondemand_init_close_req, NULL);
+
+ if (!object->ondemand->ondemand_id)
+ return;
+
+ /* Cancel all requests for the object that is being dropped. */
+ cache = object->volume->cache;
+ xa_lock(&cache->reqs);
+ cachefiles_ondemand_set_object_dropping(object);
+ xa_for_each(&cache->reqs, index, req) {
+ if (req->object == object) {
+ req->error = -EIO;
+ complete(&req->done);
+ __xa_erase(&cache->reqs, index);
+ }
+ }
+ xa_unlock(&cache->reqs);
+
+ /* Wait for ondemand_object_worker() to finish to avoid UAF. */
+ cancel_work_sync(&object->ondemand->ondemand_work);
}
int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index 89df0ba8ba5e..781aac4ef274 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -133,7 +133,6 @@ void cachefiles_free_volume(struct fscache_volume *vcookie)
void cachefiles_withdraw_volume(struct cachefiles_volume *volume)
{
- fscache_withdraw_volume(volume->vcookie);
cachefiles_set_volume_xattr(volume);
__cachefiles_free_volume(volume);
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index bcb6173943ee..4dd8a993c60a 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file
if (xlen == 0)
xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen);
if (xlen != tlen) {
- if (xlen < 0)
+ if (xlen < 0) {
+ ret = xlen;
trace_cachefiles_vfs_error(object, file_inode(file), xlen,
cachefiles_trace_getxattr_error);
+ }
if (xlen == -EIO)
cachefiles_io_error_obj(
object,
@@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len);
if (xlen != len) {
if (xlen < 0) {
+ ret = xlen;
trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen,
cachefiles_trace_getxattr_error);
if (xlen == -EIO)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82a2e2a06a65..5aadc56e0cc0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,7 +141,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
if (ptr_pos >= i_size_read(dir))
return NULL;
- if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+ if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) {
ceph_readdir_cache_release(cache_ctl);
cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
if (!cache_ctl->page) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 249ddfbb1b03..8f8de8f33abb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1863,7 +1863,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
- if (!ctl->page || pgoff != page_index(ctl->page)) {
+ if (!ctl->page || pgoff != ctl->page->index) {
ceph_readdir_cache_release(ctl);
if (idx == 0)
ctl->page = grab_cache_page(&dir->i_data, pgoff);
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ccdbec388091..40f84d014524 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -31,15 +31,7 @@ static int coda_symlink_filler(struct file *file, struct folio *folio)
cii = ITOC(inode);
error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len);
- if (error)
- goto fail;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
-
-fail:
- folio_set_error(folio);
- folio_unlock(folio);
+ folio_end_read(folio, error == 0);
return error;
}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 18677cd4e62f..43d6bde1adcc 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -580,6 +580,7 @@ static void detach_attrs(struct config_item * item)
static int populate_attrs(struct config_item *item)
{
const struct config_item_type *t = item->ci_type;
+ struct configfs_group_operations *ops;
struct configfs_attribute *attr;
struct configfs_bin_attribute *bin_attr;
int error = 0;
@@ -587,14 +588,23 @@ static int populate_attrs(struct config_item *item)
if (!t)
return -EINVAL;
+
+ ops = t->ct_group_ops;
+
if (t->ct_attrs) {
for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
+ if (ops && ops->is_visible && !ops->is_visible(item, attr, i))
+ continue;
+
if ((error = configfs_create_file(item, attr)))
break;
}
}
if (t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+ if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
+ continue;
+
error = configfs_create_bin_file(item, bin_attr);
if (error)
break;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 460690ca0174..b84d1747a020 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -811,19 +811,19 @@ out:
static int cramfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
u32 maxblock;
int bytes_filled;
void *pgdata;
+ bool success = false;
maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
bytes_filled = 0;
- pgdata = kmap_local_page(page);
+ pgdata = kmap_local_folio(folio, 0);
- if (page->index < maxblock) {
+ if (folio->index < maxblock) {
struct super_block *sb = inode->i_sb;
- u32 blkptr_offset = OFFSET(inode) + page->index * 4;
+ u32 blkptr_offset = OFFSET(inode) + folio->index * 4;
u32 block_ptr, block_start, block_len;
bool uncompressed, direct;
@@ -844,7 +844,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
if (uncompressed) {
block_len = PAGE_SIZE;
/* if last block: cap to file length */
- if (page->index == maxblock - 1)
+ if (folio->index == maxblock - 1)
block_len =
offset_in_page(inode->i_size);
} else {
@@ -861,7 +861,7 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
* from the previous block's pointer.
*/
block_start = OFFSET(inode) + maxblock * 4;
- if (page->index)
+ if (folio->index)
block_start = *(u32 *)
cramfs_read(sb, blkptr_offset - 4, 4);
/* Beware... previous ptr might be a direct ptr */
@@ -906,17 +906,12 @@ static int cramfs_read_folio(struct file *file, struct folio *folio)
}
memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
- flush_dcache_page(page);
- kunmap_local(pgdata);
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ flush_dcache_folio(folio);
+ success = true;
err:
kunmap_local(pgdata);
- ClearPageUptodate(page);
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, success);
return 0;
}
@@ -1003,4 +998,5 @@ static void __exit exit_cramfs_fs(void)
module_init(init_cramfs_fs)
module_exit(exit_cramfs_fs)
+MODULE_DESCRIPTION("Compressed ROM file system support");
MODULE_LICENSE("GPL");
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..8bdc278a0205 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,6 +35,8 @@
#include "internal.h"
#include "mount.h"
+#include <asm/runtime-const.h>
+
/*
* Usage:
* dcache->d_inode->i_lock protects:
@@ -100,9 +102,10 @@ static unsigned int d_hash_shift __ro_after_init;
static struct hlist_bl_head *dentry_hashtable __ro_after_init;
-static inline struct hlist_bl_head *d_hash(unsigned int hash)
+static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
{
- return dentry_hashtable + (hash >> d_hash_shift);
+ return runtime_const_ptr(dentry_hashtable) +
+ runtime_const_shift_right_32(hashlen, d_hash_shift);
}
#define IN_LOOKUP_SHIFT 10
@@ -355,7 +358,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
flags &= ~DCACHE_ENTRY_TYPE;
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
- if (flags & DCACHE_LRU_LIST)
+ /*
+ * The negative counter only tracks dentries on the LRU. Don't inc if
+ * d_lru is on another list.
+ */
+ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
this_cpu_inc(nr_dentry_negative);
}
@@ -1548,7 +1555,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
{
struct dentry *dentry;
- WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
+ rwsem_assert_held_write(&sb->s_umount);
dentry = sb->s_root;
sb->s_root = NULL;
@@ -1844,9 +1851,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
/*
- * Decrement negative dentry count if it was in the LRU list.
+ * The negative counter only tracks dentries on the LRU. Don't dec if
+ * d_lru is on another list.
*/
- if (dentry->d_flags & DCACHE_LRU_LIST)
+ if ((dentry->d_flags &
+ (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
this_cpu_dec(nr_dentry_negative);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
raw_write_seqcount_begin(&dentry->d_seq);
@@ -2104,7 +2113,7 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
unsigned *seqp)
{
u64 hashlen = name->hash_len;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen);
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -2171,7 +2180,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen);
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -3029,28 +3038,25 @@ EXPORT_SYMBOL(d_splice_alias);
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- bool result;
+ bool subdir;
unsigned seq;
if (new_dentry == old_dentry)
return true;
- do {
- /* for restarting inner loop in case of seq retry */
- seq = read_seqbegin(&rename_lock);
- /*
- * Need rcu_readlock to protect against the d_parent trashing
- * due to d_move
- */
- rcu_read_lock();
- if (d_ancestor(old_dentry, new_dentry))
- result = true;
- else
- result = false;
- rcu_read_unlock();
- } while (read_seqretry(&rename_lock, seq));
-
- return result;
+ /* Access d_parent under rcu as d_move() may change it. */
+ rcu_read_lock();
+ seq = read_seqbegin(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ /* Try lockless once... */
+ if (read_seqretry(&rename_lock, seq)) {
+ /* ...else acquire lock for progress even on deep chains. */
+ read_seqlock_excl(&rename_lock);
+ subdir = d_ancestor(old_dentry, new_dentry);
+ read_sequnlock_excl(&rename_lock);
+ }
+ rcu_read_unlock();
+ return subdir;
}
EXPORT_SYMBOL(is_subdir);
@@ -3100,6 +3106,34 @@ void d_tmpfile(struct file *file, struct inode *inode)
}
EXPORT_SYMBOL(d_tmpfile);
+/*
+ * Obtain inode number of the parent dentry.
+ */
+ino_t d_parent_ino(struct dentry *dentry)
+{
+ struct dentry *parent;
+ struct inode *iparent;
+ unsigned seq;
+ ino_t ret;
+
+ scoped_guard(rcu) {
+ seq = raw_seqcount_begin(&dentry->d_seq);
+ parent = READ_ONCE(dentry->d_parent);
+ iparent = d_inode_rcu(parent);
+ if (likely(iparent)) {
+ ret = iparent->i_ino;
+ if (!read_seqcount_retry(&dentry->d_seq, seq))
+ return ret;
+ }
+ }
+
+ spin_lock(&dentry->d_lock);
+ ret = dentry->d_parent->d_inode->i_ino;
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
+EXPORT_SYMBOL(d_parent_ino);
+
static __initdata unsigned long dhash_entries;
static int __init set_dhash_entries(char *str)
{
@@ -3129,6 +3163,9 @@ static void __init dcache_init_early(void)
0,
0);
d_hash_shift = 32 - d_hash_shift;
+
+ runtime_const_init(shift, d_hash_shift);
+ runtime_const_init(ptr, dentry_hashtable);
}
static void __init dcache_init(void)
@@ -3157,6 +3194,9 @@ static void __init dcache_init(void)
0,
0);
d_hash_shift = 32 - d_hash_shift;
+
+ runtime_const_init(shift, d_hash_shift);
+ runtime_const_init(ptr, dentry_hashtable);
}
/* SLAB cache for __getname() consumers */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8fd928899a59..91521576f500 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -92,9 +92,9 @@ enum {
};
static const struct fs_parameter_spec debugfs_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_u32oct ("mode", Opt_mode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -102,8 +102,6 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
{
struct debugfs_fs_info *opts = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, debugfs_param_specs, param, &result);
@@ -120,16 +118,10 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
switch (opt) {
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalf(fc, "Unknown uid");
- opts->uid = uid;
+ opts->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalf(fc, "Unknown gid");
- opts->gid = gid;
+ opts->gid = result.gid;
break;
case Opt_mode:
opts->mode = result.uint_32 & S_IALLUGO;
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 59711486d801..742b30b61c19 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,35 +18,52 @@
#include "user.h"
#include "ast.h"
-static void dlm_callback_work(struct work_struct *work)
+static void dlm_run_callback(uint32_t ls_id, uint32_t lkb_id, int8_t mode,
+ uint32_t flags, uint8_t sb_flags, int sb_status,
+ struct dlm_lksb *lksb,
+ void (*astfn)(void *astparam),
+ void (*bastfn)(void *astparam, int mode),
+ void *astparam, const char *res_name,
+ size_t res_length)
{
- struct dlm_callback *cb = container_of(work, struct dlm_callback, work);
-
- if (cb->flags & DLM_CB_BAST) {
- trace_dlm_bast(cb->ls_id, cb->lkb_id, cb->mode, cb->res_name,
- cb->res_length);
- cb->bastfn(cb->astparam, cb->mode);
- } else if (cb->flags & DLM_CB_CAST) {
- trace_dlm_ast(cb->ls_id, cb->lkb_id, cb->sb_status,
- cb->sb_flags, cb->res_name, cb->res_length);
- cb->lkb_lksb->sb_status = cb->sb_status;
- cb->lkb_lksb->sb_flags = cb->sb_flags;
- cb->astfn(cb->astparam);
+ if (flags & DLM_CB_BAST) {
+ trace_dlm_bast(ls_id, lkb_id, mode, res_name, res_length);
+ bastfn(astparam, mode);
+ } else if (flags & DLM_CB_CAST) {
+ trace_dlm_ast(ls_id, lkb_id, sb_status, sb_flags, res_name,
+ res_length);
+ lksb->sb_status = sb_status;
+ lksb->sb_flags = sb_flags;
+ astfn(astparam);
}
+}
+static void dlm_do_callback(struct dlm_callback *cb)
+{
+ dlm_run_callback(cb->ls_id, cb->lkb_id, cb->mode, cb->flags,
+ cb->sb_flags, cb->sb_status, cb->lkb_lksb,
+ cb->astfn, cb->bastfn, cb->astparam,
+ cb->res_name, cb->res_length);
dlm_free_cb(cb);
}
-int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags,
- struct dlm_callback **cb)
+static void dlm_callback_work(struct work_struct *work)
+{
+ struct dlm_callback *cb = container_of(work, struct dlm_callback, work);
+
+ dlm_do_callback(cb);
+}
+
+bool dlm_may_skip_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags, int *copy_lvb)
{
struct dlm_rsb *rsb = lkb->lkb_resource;
- int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
struct dlm_ls *ls = rsb->res_ls;
- int copy_lvb = 0;
int prev_mode;
+ if (copy_lvb)
+ *copy_lvb = 0;
+
if (flags & DLM_CB_BAST) {
/* if cb is a bast, it should be skipped if the blocking mode is
* compatible with the last granted mode
@@ -56,7 +73,7 @@ int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
log_debug(ls, "skip %x bast mode %d for cast mode %d",
lkb->lkb_id, mode,
lkb->lkb_last_cast_cb_mode);
- goto out;
+ return true;
}
}
@@ -74,7 +91,7 @@ int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
(prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
log_debug(ls, "skip %x add bast mode %d for bast mode %d",
lkb->lkb_id, mode, prev_mode);
- goto out;
+ return true;
}
}
@@ -85,8 +102,10 @@ int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
prev_mode = lkb->lkb_last_cast_cb_mode;
if (!status && lkb->lkb_lksb->sb_lvbptr &&
- dlm_lvb_operations[prev_mode + 1][mode + 1])
- copy_lvb = 1;
+ dlm_lvb_operations[prev_mode + 1][mode + 1]) {
+ if (copy_lvb)
+ *copy_lvb = 1;
+ }
}
lkb->lkb_last_cast_cb_mode = mode;
@@ -96,11 +115,19 @@ int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
lkb->lkb_last_cb_mode = mode;
lkb->lkb_last_cb_flags = flags;
+ return false;
+}
+
+int dlm_get_cb(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags,
+ struct dlm_callback **cb)
+{
+ struct dlm_rsb *rsb = lkb->lkb_resource;
+ struct dlm_ls *ls = rsb->res_ls;
+
*cb = dlm_allocate_cb();
- if (!*cb) {
- rv = DLM_ENQUEUE_CALLBACK_FAILURE;
- goto out;
- }
+ if (WARN_ON_ONCE(!*cb))
+ return -ENOMEM;
/* for tracing */
(*cb)->lkb_id = lkb->lkb_id;
@@ -112,19 +139,34 @@ int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
(*cb)->mode = mode;
(*cb)->sb_status = status;
(*cb)->sb_flags = (sbflags & 0x000000FF);
- (*cb)->copy_lvb = copy_lvb;
(*cb)->lkb_lksb = lkb->lkb_lksb;
- rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
+ return 0;
+}
+
+static int dlm_get_queue_cb(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags,
+ struct dlm_callback **cb)
+{
+ int rv;
+
+ rv = dlm_get_cb(lkb, flags, mode, status, sbflags, cb);
+ if (rv)
+ return rv;
-out:
- return rv;
+ (*cb)->astfn = lkb->lkb_astfn;
+ (*cb)->bastfn = lkb->lkb_bastfn;
+ (*cb)->astparam = lkb->lkb_astparam;
+ INIT_WORK(&(*cb)->work, dlm_callback_work);
+
+ return 0;
}
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
- uint32_t sbflags)
+ uint32_t sbflags)
{
- struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ struct dlm_rsb *rsb = lkb->lkb_resource;
+ struct dlm_ls *ls = rsb->res_ls;
struct dlm_callback *cb;
int rv;
@@ -133,34 +175,36 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
return;
}
- rv = dlm_queue_lkb_callback(lkb, flags, mode, status, sbflags,
- &cb);
- switch (rv) {
- case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
- cb->astfn = lkb->lkb_astfn;
- cb->bastfn = lkb->lkb_bastfn;
- cb->astparam = lkb->lkb_astparam;
- INIT_WORK(&cb->work, dlm_callback_work);
-
- spin_lock_bh(&ls->ls_cb_lock);
- if (test_bit(LSFL_CB_DELAY, &ls->ls_flags))
+ if (dlm_may_skip_callback(lkb, flags, mode, status, sbflags, NULL))
+ return;
+
+ spin_lock_bh(&ls->ls_cb_lock);
+ if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
+ rv = dlm_get_queue_cb(lkb, flags, mode, status, sbflags, &cb);
+ if (!rv)
list_add(&cb->list, &ls->ls_cb_delay);
- else
- queue_work(ls->ls_callback_wq, &cb->work);
- spin_unlock_bh(&ls->ls_cb_lock);
- break;
- case DLM_ENQUEUE_CALLBACK_SUCCESS:
- break;
- case DLM_ENQUEUE_CALLBACK_FAILURE:
- fallthrough;
- default:
- WARN_ON_ONCE(1);
- break;
+ } else {
+ if (test_bit(LSFL_SOFTIRQ, &ls->ls_flags)) {
+ dlm_run_callback(ls->ls_global_id, lkb->lkb_id, mode, flags,
+ sbflags, status, lkb->lkb_lksb,
+ lkb->lkb_astfn, lkb->lkb_bastfn,
+ lkb->lkb_astparam, rsb->res_name,
+ rsb->res_length);
+ } else {
+ rv = dlm_get_queue_cb(lkb, flags, mode, status, sbflags, &cb);
+ if (!rv)
+ queue_work(ls->ls_callback_wq, &cb->work);
+ }
}
+ spin_unlock_bh(&ls->ls_cb_lock);
}
int dlm_callback_start(struct dlm_ls *ls)
{
+ if (!test_bit(LSFL_FS, &ls->ls_flags) ||
+ test_bit(LSFL_SOFTIRQ, &ls->ls_flags))
+ return 0;
+
ls->ls_callback_wq = alloc_ordered_workqueue("dlm_callback",
WQ_HIGHPRI | WQ_MEM_RECLAIM);
if (!ls->ls_callback_wq) {
@@ -178,13 +222,15 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
- if (ls->ls_callback_wq) {
- spin_lock_bh(&ls->ls_cb_lock);
- set_bit(LSFL_CB_DELAY, &ls->ls_flags);
- spin_unlock_bh(&ls->ls_cb_lock);
+ if (!test_bit(LSFL_FS, &ls->ls_flags))
+ return;
+
+ spin_lock_bh(&ls->ls_cb_lock);
+ set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock_bh(&ls->ls_cb_lock);
+ if (ls->ls_callback_wq)
flush_workqueue(ls->ls_callback_wq);
- }
}
#define MAX_CB_QUEUE 25
@@ -195,14 +241,18 @@ void dlm_callback_resume(struct dlm_ls *ls)
int count = 0, sum = 0;
bool empty;
- if (!ls->ls_callback_wq)
+ if (!test_bit(LSFL_FS, &ls->ls_flags))
return;
more:
spin_lock_bh(&ls->ls_cb_lock);
list_for_each_entry_safe(cb, safe, &ls->ls_cb_delay, list) {
list_del(&cb->list);
- queue_work(ls->ls_callback_wq, &cb->work);
+ if (test_bit(LSFL_SOFTIRQ, &ls->ls_flags))
+ dlm_do_callback(cb);
+ else
+ queue_work(ls->ls_callback_wq, &cb->work);
+
count++;
if (count == MAX_CB_QUEUE)
break;
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 9093ff043bee..e2b86845d331 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,12 +11,11 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1
-#define DLM_ENQUEUE_CALLBACK_SUCCESS 0
-#define DLM_ENQUEUE_CALLBACK_FAILURE -1
-int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags,
- struct dlm_callback **cb);
+bool dlm_may_skip_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags, int *copy_lvb);
+int dlm_get_cb(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags,
+ struct dlm_callback **cb);
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 517fa975dc5a..99952234799e 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -672,7 +672,7 @@ static ssize_t comm_addr_store(struct config_item *item, const char *buf,
memcpy(addr, buf, len);
- rv = dlm_midcomms_addr(cm->nodeid, addr, len);
+ rv = dlm_midcomms_addr(cm->nodeid, addr);
if (rv) {
kfree(addr);
return rv;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6ab3ed4074c6..7112958c2e5b 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -380,7 +380,7 @@ static const struct seq_operations format4_seq_ops;
static int table_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct dlm_rsb *rsb = list_entry(iter_ptr, struct dlm_rsb, res_rsbs_list);
+ struct dlm_rsb *rsb = list_entry(iter_ptr, struct dlm_rsb, res_slow_list);
if (seq->op == &format1_seq_ops)
print_format1(rsb, seq);
@@ -409,9 +409,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
}
if (seq->op == &format4_seq_ops)
- list = &ls->ls_toss;
+ list = &ls->ls_slow_inactive;
else
- list = &ls->ls_keep;
+ list = &ls->ls_slow_active;
read_lock_bh(&ls->ls_rsbtbl_lock);
return seq_list_start(list, *pos);
@@ -423,9 +423,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
struct list_head *list;
if (seq->op == &format4_seq_ops)
- list = &ls->ls_toss;
+ list = &ls->ls_slow_inactive;
else
- list = &ls->ls_keep;
+ list = &ls->ls_slow_active;
return seq_list_next(iter_ptr, list, pos);
}
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 9085ba3b2f20..32d98e63d25e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -36,7 +36,7 @@
#include <linux/miscdevice.h>
#include <linux/rhashtable.h>
#include <linux/mutex.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
#include <linux/ratelimit.h>
#include <linux/uaccess.h>
@@ -316,26 +316,24 @@ struct dlm_rsb {
int res_nodeid;
int res_master_nodeid;
int res_dir_nodeid;
- int res_id; /* for ls_recover_idr */
+ unsigned long res_id; /* for ls_recover_xa */
uint32_t res_lvbseq;
uint32_t res_hash;
unsigned long res_toss_time;
uint32_t res_first_lkid;
struct list_head res_lookup; /* lkbs waiting on first */
- union {
- struct list_head res_hashchain;
- struct rhash_head res_node; /* rsbtbl */
- };
+ struct rhash_head res_node; /* rsbtbl */
struct list_head res_grantqueue;
struct list_head res_convertqueue;
struct list_head res_waitqueue;
- struct list_head res_rsbs_list;
+ struct list_head res_slow_list; /* ls_slow_* */
+ struct list_head res_scan_list;
struct list_head res_root_list; /* used for recovery */
struct list_head res_masters_list; /* used for recovery */
struct list_head res_recover_list; /* used for recovery */
- struct list_head res_toss_q_list;
int res_recover_locks_count;
+ struct rcu_head rcu;
char *res_lvbptr;
char res_name[DLM_RESNAME_MAXLEN+1];
@@ -368,7 +366,8 @@ enum rsb_flags {
RSB_RECOVER_CONVERT,
RSB_RECOVER_GRANT,
RSB_RECOVER_LVB_INVAL,
- RSB_TOSS,
+ RSB_INACTIVE,
+ RSB_HASHED, /* set while rsb is on ls_rsbtbl */
};
static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
@@ -559,16 +558,8 @@ struct rcom_lock {
char rl_lvb[];
};
-/*
- * The max number of resources per rsbtbl bucket that shrink will attempt
- * to remove in each iteration.
- */
-
-#define DLM_REMOVE_NAMES_MAX 8
-
struct dlm_ls {
struct list_head ls_list; /* list of lockspaces */
- dlm_lockspace_t *ls_local_handle;
uint32_t ls_global_id; /* global unique lockspace ID */
uint32_t ls_generation;
uint32_t ls_exflags;
@@ -578,26 +569,21 @@ struct dlm_ls {
wait_queue_head_t ls_count_wait;
int ls_create_count; /* create/release refcount */
unsigned long ls_flags; /* LSFL_ */
- unsigned long ls_scan_time;
struct kobject ls_kobj;
- struct idr ls_lkbidr;
- rwlock_t ls_lkbidr_lock;
+ struct xarray ls_lkbxa;
+ rwlock_t ls_lkbxa_lock;
+ /* an rsb is on rsbtl for primary locking functions,
+ and on a slow list for recovery/dump iteration */
struct rhashtable ls_rsbtbl;
- rwlock_t ls_rsbtbl_lock;
+ rwlock_t ls_rsbtbl_lock; /* for ls_rsbtbl and ls_slow */
+ struct list_head ls_slow_inactive; /* to iterate rsbtbl */
+ struct list_head ls_slow_active; /* to iterate rsbtbl */
- struct list_head ls_toss;
- struct list_head ls_keep;
-
- struct timer_list ls_timer;
- /* this queue is ordered according the
- * absolute res_toss_time jiffies time
- * to mod_timer() with the first element
- * if necessary.
- */
- struct list_head ls_toss_q;
- spinlock_t ls_toss_q_lock;
+ struct timer_list ls_scan_timer; /* based on first scan_list rsb toss_time */
+ struct list_head ls_scan_list; /* rsbs ordered by res_toss_time */
+ spinlock_t ls_scan_lock;
spinlock_t ls_waiters_lock;
struct list_head ls_waiters; /* lkbs needing a reply */
@@ -605,10 +591,6 @@ struct dlm_ls {
spinlock_t ls_orphans_lock;
struct list_head ls_orphans;
- spinlock_t ls_new_rsb_spin;
- int ls_new_rsb_count;
- struct list_head ls_new_rsb; /* new rsb structs */
-
struct list_head ls_nodes; /* current nodes in ls */
struct list_head ls_nodes_gone; /* dead node list, recovery */
int ls_num_nodes; /* number of nodes in ls */
@@ -664,8 +646,8 @@ struct dlm_ls {
struct list_head ls_recover_list;
spinlock_t ls_recover_list_lock;
int ls_recover_list_count;
- struct idr ls_recover_idr;
- spinlock_t ls_recover_idr_lock;
+ struct xarray ls_recover_xa;
+ spinlock_t ls_recover_xa_lock;
wait_queue_head_t ls_wait_general;
wait_queue_head_t ls_recover_lock_wait;
spinlock_t ls_clear_proc_locks;
@@ -716,6 +698,8 @@ struct dlm_ls {
#define LSFL_CB_DELAY 9
#define LSFL_NODIR 10
#define LSFL_RECV_MSG_BLOCKED 11
+#define LSFL_FS 12
+#define LSFL_SOFTIRQ 13
#define DLM_PROC_FLAGS_CLOSING 1
#define DLM_PROC_FLAGS_COMPAT 2
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index f103b8c30592..8bee4f444afd 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -89,7 +89,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
const struct dlm_message *ms, bool local);
static int receive_extralen(const struct dlm_message *ms);
static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
-static void toss_rsb(struct kref *kref);
+static void deactivate_rsb(struct kref *kref);
/*
* Lock compatibilty matrix - thanks Steve
@@ -330,8 +330,8 @@ static inline unsigned long rsb_toss_jiffies(void)
static inline void hold_rsb(struct dlm_rsb *r)
{
- /* rsbs in toss state never get referenced */
- WARN_ON(rsb_flag(r, RSB_TOSS));
+ /* inactive rsbs are not ref counted */
+ WARN_ON(rsb_flag(r, RSB_INACTIVE));
kref_get(&r->res_ref);
}
@@ -370,15 +370,12 @@ static inline int dlm_kref_put_write_lock_bh(struct kref *kref,
return 0;
}
-/* When all references to the rsb are gone it's transferred to
- the tossed list for later disposal. */
-
static void put_rsb(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
int rv;
- rv = dlm_kref_put_write_lock_bh(&r->res_ref, toss_rsb,
+ rv = dlm_kref_put_write_lock_bh(&r->res_ref, deactivate_rsb,
&ls->ls_rsbtbl_lock);
if (rv)
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -389,82 +386,54 @@ void dlm_put_rsb(struct dlm_rsb *r)
put_rsb(r);
}
-static int pre_rsb_struct(struct dlm_ls *ls)
-{
- struct dlm_rsb *r1, *r2;
- int count = 0;
-
- spin_lock_bh(&ls->ls_new_rsb_spin);
- if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) {
- spin_unlock_bh(&ls->ls_new_rsb_spin);
- return 0;
- }
- spin_unlock_bh(&ls->ls_new_rsb_spin);
-
- r1 = dlm_allocate_rsb(ls);
- r2 = dlm_allocate_rsb(ls);
-
- spin_lock_bh(&ls->ls_new_rsb_spin);
- if (r1) {
- list_add(&r1->res_hashchain, &ls->ls_new_rsb);
- ls->ls_new_rsb_count++;
- }
- if (r2) {
- list_add(&r2->res_hashchain, &ls->ls_new_rsb);
- ls->ls_new_rsb_count++;
- }
- count = ls->ls_new_rsb_count;
- spin_unlock_bh(&ls->ls_new_rsb_spin);
-
- if (!count)
- return -ENOMEM;
- return 0;
-}
-
/* connected with timer_delete_sync() in dlm_ls_stop() to stop
* new timers when recovery is triggered and don't run them
- * again until a dlm_timer_resume() tries it again.
+ * again until a resume_scan_timer() tries it again.
*/
-static void __rsb_mod_timer(struct dlm_ls *ls, unsigned long jiffies)
+static void enable_scan_timer(struct dlm_ls *ls, unsigned long jiffies)
{
if (!dlm_locking_stopped(ls))
- mod_timer(&ls->ls_timer, jiffies);
+ mod_timer(&ls->ls_scan_timer, jiffies);
}
/* This function tries to resume the timer callback if a rsb
- * is on the toss list and no timer is pending. It might that
+ * is on the scan list and no timer is pending. It might that
* the first entry is on currently executed as timer callback
* but we don't care if a timer queued up again and does
* nothing. Should be a rare case.
*/
-void dlm_timer_resume(struct dlm_ls *ls)
+void resume_scan_timer(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- spin_lock_bh(&ls->ls_toss_q_lock);
- r = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
- res_toss_q_list);
- if (r && !timer_pending(&ls->ls_timer))
- __rsb_mod_timer(ls, r->res_toss_time);
- spin_unlock_bh(&ls->ls_toss_q_lock);
+ spin_lock_bh(&ls->ls_scan_lock);
+ r = list_first_entry_or_null(&ls->ls_scan_list, struct dlm_rsb,
+ res_scan_list);
+ if (r && !timer_pending(&ls->ls_scan_timer))
+ enable_scan_timer(ls, r->res_toss_time);
+ spin_unlock_bh(&ls->ls_scan_lock);
}
-/* ls_rsbtbl_lock must be held and being sure the rsb is in toss state */
-static void rsb_delete_toss_timer(struct dlm_ls *ls, struct dlm_rsb *r)
+/* ls_rsbtbl_lock must be held */
+
+static void del_scan(struct dlm_ls *ls, struct dlm_rsb *r)
{
struct dlm_rsb *first;
- spin_lock_bh(&ls->ls_toss_q_lock);
+ /* active rsbs should never be on the scan list */
+ WARN_ON(!rsb_flag(r, RSB_INACTIVE));
+
+ spin_lock_bh(&ls->ls_scan_lock);
r->res_toss_time = 0;
/* if the rsb is not queued do nothing */
- if (list_empty(&r->res_toss_q_list))
+ if (list_empty(&r->res_scan_list))
goto out;
/* get the first element before delete */
- first = list_first_entry(&ls->ls_toss_q, struct dlm_rsb,
- res_toss_q_list);
- list_del_init(&r->res_toss_q_list);
+ first = list_first_entry(&ls->ls_scan_list, struct dlm_rsb,
+ res_scan_list);
+ list_del_init(&r->res_scan_list);
/* check if the first element was the rsb we deleted */
if (first == r) {
/* try to get the new first element, if the list
@@ -474,70 +443,59 @@ static void rsb_delete_toss_timer(struct dlm_ls *ls, struct dlm_rsb *r)
* if the list isn't empty and a new first element got
* in place, set the new timer expire time.
*/
- first = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
- res_toss_q_list);
+ first = list_first_entry_or_null(&ls->ls_scan_list, struct dlm_rsb,
+ res_scan_list);
if (!first)
- timer_delete(&ls->ls_timer);
+ timer_delete(&ls->ls_scan_timer);
else
- __rsb_mod_timer(ls, first->res_toss_time);
+ enable_scan_timer(ls, first->res_toss_time);
}
out:
- spin_unlock_bh(&ls->ls_toss_q_lock);
+ spin_unlock_bh(&ls->ls_scan_lock);
}
-/* Caller must held ls_rsbtbl_lock and need to be called every time
- * when either the rsb enters toss state or the toss state changes
- * the dir/master nodeid.
- */
-static void rsb_mod_timer(struct dlm_ls *ls, struct dlm_rsb *r)
+static void add_scan(struct dlm_ls *ls, struct dlm_rsb *r)
{
int our_nodeid = dlm_our_nodeid();
struct dlm_rsb *first;
- /* If we're the directory record for this rsb, and
- * we're not the master of it, then we need to wait
- * for the master node to send us a dir remove for
- * before removing the dir record.
- */
- if (!dlm_no_directory(ls) &&
- (r->res_master_nodeid != our_nodeid) &&
- (dlm_dir_nodeid(r) == our_nodeid)) {
- rsb_delete_toss_timer(ls, r);
- return;
- }
+ /* A dir record for a remote master rsb should never be on the scan list. */
+ WARN_ON(!dlm_no_directory(ls) &&
+ (r->res_master_nodeid != our_nodeid) &&
+ (dlm_dir_nodeid(r) == our_nodeid));
+
+ /* An active rsb should never be on the scan list. */
+ WARN_ON(!rsb_flag(r, RSB_INACTIVE));
- spin_lock_bh(&ls->ls_toss_q_lock);
+ /* An rsb should not already be on the scan list. */
+ WARN_ON(!list_empty(&r->res_scan_list));
+
+ spin_lock_bh(&ls->ls_scan_lock);
/* set the new rsb absolute expire time in the rsb */
r->res_toss_time = rsb_toss_jiffies();
- if (list_empty(&ls->ls_toss_q)) {
+ if (list_empty(&ls->ls_scan_list)) {
/* if the queue is empty add the element and it's
* our new expire time
*/
- list_add_tail(&r->res_toss_q_list, &ls->ls_toss_q);
- __rsb_mod_timer(ls, r->res_toss_time);
+ list_add_tail(&r->res_scan_list, &ls->ls_scan_list);
+ enable_scan_timer(ls, r->res_toss_time);
} else {
- /* check if the rsb was already queued, if so delete
- * it from the toss queue
- */
- if (!list_empty(&r->res_toss_q_list))
- list_del(&r->res_toss_q_list);
-
/* try to get the maybe new first element and then add
* to this rsb with the oldest expire time to the end
* of the queue. If the list was empty before this
* rsb expire time is our next expiration if it wasn't
* the now new first elemet is our new expiration time
*/
- first = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
- res_toss_q_list);
- list_add_tail(&r->res_toss_q_list, &ls->ls_toss_q);
+ first = list_first_entry_or_null(&ls->ls_scan_list, struct dlm_rsb,
+ res_scan_list);
+ list_add_tail(&r->res_scan_list, &ls->ls_scan_list);
if (!first)
- __rsb_mod_timer(ls, r->res_toss_time);
+ enable_scan_timer(ls, r->res_toss_time);
else
- __rsb_mod_timer(ls, first->res_toss_time);
+ enable_scan_timer(ls, first->res_toss_time);
}
- spin_unlock_bh(&ls->ls_toss_q_lock);
+ spin_unlock_bh(&ls->ls_scan_lock);
}
/* if we hit contention we do in 250 ms a retry to trylock.
@@ -547,9 +505,11 @@ static void rsb_mod_timer(struct dlm_ls *ls, struct dlm_rsb *r)
*/
#define DLM_TOSS_TIMER_RETRY (jiffies + msecs_to_jiffies(250))
-void dlm_rsb_toss_timer(struct timer_list *timer)
+/* Called by lockspace scan_timer to free unused rsb's. */
+
+void dlm_rsb_scan(struct timer_list *timer)
{
- struct dlm_ls *ls = from_timer(ls, timer, ls_timer);
+ struct dlm_ls *ls = from_timer(ls, timer, ls_scan_timer);
int our_nodeid = dlm_our_nodeid();
struct dlm_rsb *r;
int rv;
@@ -557,76 +517,63 @@ void dlm_rsb_toss_timer(struct timer_list *timer)
while (1) {
/* interrupting point to leave iteration when
* recovery waits for timer_delete_sync(), recovery
- * will take care to delete everything in toss queue.
+ * will take care to delete everything in scan list.
*/
if (dlm_locking_stopped(ls))
break;
- rv = spin_trylock(&ls->ls_toss_q_lock);
+ rv = spin_trylock(&ls->ls_scan_lock);
if (!rv) {
/* rearm again try timer */
- __rsb_mod_timer(ls, DLM_TOSS_TIMER_RETRY);
+ enable_scan_timer(ls, DLM_TOSS_TIMER_RETRY);
break;
}
- r = list_first_entry_or_null(&ls->ls_toss_q, struct dlm_rsb,
- res_toss_q_list);
+ r = list_first_entry_or_null(&ls->ls_scan_list, struct dlm_rsb,
+ res_scan_list);
if (!r) {
- /* nothing to do anymore next rsb queue will
- * set next mod_timer() expire.
- */
- spin_unlock(&ls->ls_toss_q_lock);
+ /* the next add_scan will enable the timer again */
+ spin_unlock(&ls->ls_scan_lock);
break;
}
- /* test if the first rsb isn't expired yet, if
- * so we stop freeing rsb from toss queue as
- * the order in queue is ascending to the
- * absolute res_toss_time jiffies
+ /*
+ * If the first rsb is not yet expired, then stop because the
+ * list is sorted with nearest expiration first.
*/
if (time_before(jiffies, r->res_toss_time)) {
/* rearm with the next rsb to expire in the future */
- __rsb_mod_timer(ls, r->res_toss_time);
- spin_unlock(&ls->ls_toss_q_lock);
+ enable_scan_timer(ls, r->res_toss_time);
+ spin_unlock(&ls->ls_scan_lock);
break;
}
/* in find_rsb_dir/nodir there is a reverse order of this
* lock, however this is only a trylock if we hit some
* possible contention we try it again.
- *
- * This lock synchronized while holding ls_toss_q_lock
- * synchronize everything that rsb_delete_toss_timer()
- * or rsb_mod_timer() can't run after this timer callback
- * deletes the rsb from the ls_toss_q. Whereas the other
- * holders have always a priority to run as this is only
- * a caching handling and the other holders might to put
- * this rsb out of the toss state.
*/
rv = write_trylock(&ls->ls_rsbtbl_lock);
if (!rv) {
- spin_unlock(&ls->ls_toss_q_lock);
+ spin_unlock(&ls->ls_scan_lock);
/* rearm again try timer */
- __rsb_mod_timer(ls, DLM_TOSS_TIMER_RETRY);
+ enable_scan_timer(ls, DLM_TOSS_TIMER_RETRY);
break;
}
- list_del(&r->res_rsbs_list);
+ list_del(&r->res_slow_list);
rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node,
dlm_rhash_rsb_params);
+ rsb_clear_flag(r, RSB_HASHED);
- /* not necessary to held the ls_rsbtbl_lock when
- * calling send_remove()
- */
+ /* ls_rsbtbl_lock is not needed when calling send_remove() */
write_unlock(&ls->ls_rsbtbl_lock);
- /* remove the rsb out of the toss queue its gone
- * drom DLM now
- */
- list_del_init(&r->res_toss_q_list);
- spin_unlock(&ls->ls_toss_q_lock);
+ list_del_init(&r->res_scan_list);
+ spin_unlock(&ls->ls_scan_lock);
- /* no rsb in this state should ever run a timer */
+ /* An rsb that is a dir record for a remote master rsb
+ * cannot be removed, and should not have a timer enabled.
+ */
WARN_ON(!dlm_no_directory(ls) &&
(r->res_master_nodeid != our_nodeid) &&
(dlm_dir_nodeid(r) == our_nodeid));
@@ -640,7 +587,7 @@ void dlm_rsb_toss_timer(struct timer_list *timer)
(dlm_dir_nodeid(r) != our_nodeid))
send_remove(r);
- free_toss_rsb(r);
+ free_inactive_rsb(r);
}
}
@@ -652,22 +599,10 @@ static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
struct dlm_rsb **r_ret)
{
struct dlm_rsb *r;
- int count;
- spin_lock_bh(&ls->ls_new_rsb_spin);
- if (list_empty(&ls->ls_new_rsb)) {
- count = ls->ls_new_rsb_count;
- spin_unlock_bh(&ls->ls_new_rsb_spin);
- log_debug(ls, "find_rsb retry %d %d %s",
- count, dlm_config.ci_new_rsb_count,
- (const char *)name);
- return -EAGAIN;
- }
-
- r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
- list_del(&r->res_hashchain);
- ls->ls_new_rsb_count--;
- spin_unlock_bh(&ls->ls_new_rsb_spin);
+ r = dlm_allocate_rsb(ls);
+ if (!r)
+ return -ENOMEM;
r->res_ls = ls;
r->res_length = len;
@@ -679,7 +614,7 @@ static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
INIT_LIST_HEAD(&r->res_convertqueue);
INIT_LIST_HEAD(&r->res_waitqueue);
INIT_LIST_HEAD(&r->res_root_list);
- INIT_LIST_HEAD(&r->res_toss_q_list);
+ INIT_LIST_HEAD(&r->res_scan_list);
INIT_LIST_HEAD(&r->res_recover_list);
INIT_LIST_HEAD(&r->res_masters_list);
@@ -702,8 +637,14 @@ int dlm_search_rsb_tree(struct rhashtable *rhash, const void *name, int len,
static int rsb_insert(struct dlm_rsb *rsb, struct rhashtable *rhash)
{
- return rhashtable_insert_fast(rhash, &rsb->res_node,
- dlm_rhash_rsb_params);
+ int rv;
+
+ rv = rhashtable_insert_fast(rhash, &rsb->res_node,
+ dlm_rhash_rsb_params);
+ if (!rv)
+ rsb_set_flag(rsb, RSB_HASHED);
+
+ return rv;
}
/*
@@ -733,7 +674,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rhashtable *rhash)
* So, if the given rsb is on the toss list, it is moved to the keep list
* before being returned.
*
- * toss_rsb() happens when all local usage of the rsb is done, i.e. no
+ * deactivate_rsb() happens when all local usage of the rsb is done, i.e. no
* more refcounts exist, so the rsb is moved from the keep list to the
* toss list.
*
@@ -781,9 +722,9 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
*
* If someone sends us a request, we are the dir node, and we do
* not find the rsb anywhere, then recreate it. This happens if
- * someone sends us a request after we have removed/freed an rsb
- * from our toss list. (They sent a request instead of lookup
- * because they are using an rsb from their toss list.)
+ * someone sends us a request after we have removed/freed an rsb.
+ * (They sent a request instead of lookup because they are using
+ * an rsb taken from their scan list.)
*/
if (from_local || from_dir ||
@@ -792,15 +733,8 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
}
retry:
- if (create) {
- error = pre_rsb_struct(ls);
- if (error < 0)
- goto out;
- }
- retry_lookup:
-
- /* check if the rsb is in keep state under read lock - likely path */
+ /* check if the rsb is active under read lock - likely path */
read_lock_bh(&ls->ls_rsbtbl_lock);
error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
if (error) {
@@ -812,9 +746,9 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
* rsb is active, so we can't check master_nodeid without lock_rsb.
*/
- if (rsb_flag(r, RSB_TOSS)) {
+ if (rsb_flag(r, RSB_INACTIVE)) {
read_unlock_bh(&ls->ls_rsbtbl_lock);
- goto do_toss;
+ goto do_inactive;
}
kref_get(&r->res_ref);
@@ -822,17 +756,29 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
goto out;
- do_toss:
+ do_inactive:
write_lock_bh(&ls->ls_rsbtbl_lock);
- /* retry lookup under write lock to see if its still in toss state
- * if not it's in keep state and we relookup - unlikely path.
+ /*
+ * The expectation here is that the rsb will have HASHED and
+ * INACTIVE flags set, and that the rsb can be moved from
+ * inactive back to active again. However, between releasing
+ * the read lock and acquiring the write lock, this rsb could
+ * have been removed from rsbtbl, and had HASHED cleared, to
+ * be freed. To deal with this case, we would normally need
+ * to repeat dlm_search_rsb_tree while holding the write lock,
+ * but rcu allows us to simply check the HASHED flag, because
+ * the rcu read lock means the rsb will not be freed yet.
+ * If the HASHED flag is not set, then the rsb is being freed,
+ * so we add a new rsb struct. If the HASHED flag is set,
+ * and INACTIVE is not set, it means another thread has
+ * made the rsb active, as we're expecting to do here, and
+ * we just repeat the lookup (this will be very unlikely.)
*/
- error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
- if (!error) {
- if (!rsb_flag(r, RSB_TOSS)) {
+ if (rsb_flag(r, RSB_HASHED)) {
+ if (!rsb_flag(r, RSB_INACTIVE)) {
write_unlock_bh(&ls->ls_rsbtbl_lock);
- goto retry_lookup;
+ goto retry;
}
} else {
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -842,14 +788,14 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
/*
* rsb found inactive (master_nodeid may be out of date unless
* we are the dir_nodeid or were the master) No other thread
- * is using this rsb because it's on the toss list, so we can
+ * is using this rsb because it's inactive, so we can
* look at or update res_master_nodeid without lock_rsb.
*/
if ((r->res_master_nodeid != our_nodeid) && from_other) {
/* our rsb was not master, and another node (not the dir node)
has sent us a request */
- log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
+ log_debug(ls, "find_rsb inactive from_other %d master %d dir %d %s",
from_nodeid, r->res_master_nodeid, dir_nodeid,
r->res_name);
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -859,7 +805,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
if ((r->res_master_nodeid != our_nodeid) && from_dir) {
/* don't think this should ever happen */
- log_error(ls, "find_rsb toss from_dir %d master %d",
+ log_error(ls, "find_rsb inactive from_dir %d master %d",
from_nodeid, r->res_master_nodeid);
dlm_print_rsb(r);
/* fix it and go on */
@@ -876,14 +822,12 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
r->res_first_lkid = 0;
}
- list_move(&r->res_rsbs_list, &ls->ls_keep);
- rsb_clear_flag(r, RSB_TOSS);
- /* rsb got out of toss state, it becomes alive again
- * and we reinit the reference counter that is only
- * valid for keep state rsbs
- */
- kref_init(&r->res_ref);
- rsb_delete_toss_timer(ls, r);
+ /* A dir record will not be on the scan list. */
+ if (r->res_dir_nodeid != our_nodeid)
+ del_scan(ls, r);
+ list_move(&r->res_slow_list, &ls->ls_slow_active);
+ rsb_clear_flag(r, RSB_INACTIVE);
+ kref_init(&r->res_ref); /* ref is now used in active state */
write_unlock_bh(&ls->ls_rsbtbl_lock);
goto out;
@@ -898,9 +842,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
goto out;
error = get_rsb_struct(ls, name, len, &r);
- if (error == -EAGAIN)
- goto retry;
- if (error)
+ if (WARN_ON_ONCE(error))
goto out;
r->res_hash = hash;
@@ -952,9 +894,9 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
*/
write_unlock_bh(&ls->ls_rsbtbl_lock);
dlm_free_rsb(r);
- goto retry_lookup;
+ goto retry;
} else if (!error) {
- list_add(&r->res_rsbs_list, &ls->ls_keep);
+ list_add(&r->res_slow_list, &ls->ls_slow_active);
}
write_unlock_bh(&ls->ls_rsbtbl_lock);
out:
@@ -976,13 +918,8 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
int error;
retry:
- error = pre_rsb_struct(ls);
- if (error < 0)
- goto out;
- retry_lookup:
-
- /* check if the rsb is in keep state under read lock - likely path */
+ /* check if the rsb is in active state under read lock - likely path */
read_lock_bh(&ls->ls_rsbtbl_lock);
error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
if (error) {
@@ -990,9 +927,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
goto do_new;
}
- if (rsb_flag(r, RSB_TOSS)) {
+ if (rsb_flag(r, RSB_INACTIVE)) {
read_unlock_bh(&ls->ls_rsbtbl_lock);
- goto do_toss;
+ goto do_inactive;
}
/*
@@ -1005,17 +942,14 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
goto out;
- do_toss:
+ do_inactive:
write_lock_bh(&ls->ls_rsbtbl_lock);
- /* retry lookup under write lock to see if its still in toss state
- * if not it's in keep state and we relookup - unlikely path.
- */
- error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
- if (!error) {
- if (!rsb_flag(r, RSB_TOSS)) {
+ /* See comment in find_rsb_dir. */
+ if (rsb_flag(r, RSB_HASHED)) {
+ if (!rsb_flag(r, RSB_INACTIVE)) {
write_unlock_bh(&ls->ls_rsbtbl_lock);
- goto retry_lookup;
+ goto retry;
}
} else {
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -1025,14 +959,14 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
/*
* rsb found inactive. No other thread is using this rsb because
- * it's on the toss list, so we can look at or update
- * res_master_nodeid without lock_rsb.
+ * it's inactive, so we can look at or update res_master_nodeid
+ * without lock_rsb.
*/
if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
/* our rsb is not master, and another node has sent us a
request; this should never happen */
- log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
+ log_error(ls, "find_rsb inactive from_nodeid %d master %d dir %d",
from_nodeid, r->res_master_nodeid, dir_nodeid);
dlm_print_rsb(r);
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -1044,21 +978,17 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
(dir_nodeid == our_nodeid)) {
/* our rsb is not master, and we are dir; may as well fix it;
this should never happen */
- log_error(ls, "find_rsb toss our %d master %d dir %d",
+ log_error(ls, "find_rsb inactive our %d master %d dir %d",
our_nodeid, r->res_master_nodeid, dir_nodeid);
dlm_print_rsb(r);
r->res_master_nodeid = our_nodeid;
r->res_nodeid = 0;
}
- list_move(&r->res_rsbs_list, &ls->ls_keep);
- rsb_clear_flag(r, RSB_TOSS);
- /* rsb got out of toss state, it becomes alive again
- * and we reinit the reference counter that is only
- * valid for keep state rsbs
- */
+ list_move(&r->res_slow_list, &ls->ls_slow_active);
+ rsb_clear_flag(r, RSB_INACTIVE);
kref_init(&r->res_ref);
- rsb_delete_toss_timer(ls, r);
+ del_scan(ls, r);
write_unlock_bh(&ls->ls_rsbtbl_lock);
goto out;
@@ -1070,10 +1000,7 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
*/
error = get_rsb_struct(ls, name, len, &r);
- if (error == -EAGAIN) {
- goto retry;
- }
- if (error)
+ if (WARN_ON_ONCE(error))
goto out;
r->res_hash = hash;
@@ -1090,9 +1017,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
*/
write_unlock_bh(&ls->ls_rsbtbl_lock);
dlm_free_rsb(r);
- goto retry_lookup;
+ goto retry;
} else if (!error) {
- list_add(&r->res_rsbs_list, &ls->ls_keep);
+ list_add(&r->res_slow_list, &ls->ls_slow_active);
}
write_unlock_bh(&ls->ls_rsbtbl_lock);
@@ -1101,12 +1028,54 @@ static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
return error;
}
+/*
+ * rsb rcu usage
+ *
+ * While rcu read lock is held, the rsb cannot be freed,
+ * which allows a lookup optimization.
+ *
+ * Two threads are accessing the same rsb concurrently,
+ * the first (A) is trying to use the rsb, the second (B)
+ * is trying to free the rsb.
+ *
+ * thread A thread B
+ * (trying to use rsb) (trying to free rsb)
+ *
+ * A1. rcu read lock
+ * A2. rsbtbl read lock
+ * A3. look up rsb in rsbtbl
+ * A4. rsbtbl read unlock
+ * B1. rsbtbl write lock
+ * B2. look up rsb in rsbtbl
+ * B3. remove rsb from rsbtbl
+ * B4. clear rsb HASHED flag
+ * B5. rsbtbl write unlock
+ * B6. begin freeing rsb using rcu...
+ *
+ * (rsb is inactive, so try to make it active again)
+ * A5. read rsb HASHED flag (safe because rsb is not freed yet)
+ * A6. the rsb HASHED flag is not set, which it means the rsb
+ * is being removed from rsbtbl and freed, so don't use it.
+ * A7. rcu read unlock
+ *
+ * B7. ...finish freeing rsb using rcu
+ * A8. create a new rsb
+ *
+ * Without the rcu optimization, steps A5-8 would need to do
+ * an extra rsbtbl lookup:
+ * A5. rsbtbl write lock
+ * A6. look up rsb in rsbtbl, not found
+ * A7. rsbtbl write unlock
+ * A8. create a new rsb
+ */
+
static int find_rsb(struct dlm_ls *ls, const void *name, int len,
int from_nodeid, unsigned int flags,
struct dlm_rsb **r_ret)
{
int dir_nodeid;
uint32_t hash;
+ int rv;
if (len > DLM_RESNAME_MAXLEN)
return -EINVAL;
@@ -1114,12 +1083,15 @@ static int find_rsb(struct dlm_ls *ls, const void *name, int len,
hash = jhash(name, len, 0);
dir_nodeid = dlm_hash2nodeid(ls, hash);
+ rcu_read_lock();
if (dlm_no_directory(ls))
- return find_rsb_nodir(ls, name, len, hash, dir_nodeid,
+ rv = find_rsb_nodir(ls, name, len, hash, dir_nodeid,
from_nodeid, flags, r_ret);
else
- return find_rsb_dir(ls, name, len, hash, dir_nodeid,
+ rv = find_rsb_dir(ls, name, len, hash, dir_nodeid,
from_nodeid, flags, r_ret);
+ rcu_read_unlock();
+ return rv;
}
/* we have received a request and found that res_master_nodeid != our_nodeid,
@@ -1166,7 +1138,7 @@ static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
}
static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
- int from_nodeid, bool toss_list, unsigned int flags,
+ int from_nodeid, bool is_inactive, unsigned int flags,
int *r_nodeid, int *result)
{
int fix_master = (flags & DLM_LU_RECOVER_MASTER);
@@ -1190,9 +1162,9 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no
r->res_nodeid = from_nodeid;
rsb_set_flag(r, RSB_NEW_MASTER);
- if (toss_list) {
- /* I don't think we should ever find it on toss list. */
- log_error(ls, "%s fix_master on toss", __func__);
+ if (is_inactive) {
+ /* I don't think we should ever find it inactive. */
+ log_error(ls, "%s fix_master inactive", __func__);
dlm_dump_rsb(r);
}
}
@@ -1232,7 +1204,7 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no
if (!from_master && !fix_master &&
(r->res_master_nodeid == from_nodeid)) {
/* this can happen when the master sends remove, the dir node
- * finds the rsb on the keep list and ignores the remove,
+ * finds the rsb on the active list and ignores the remove,
* and the former master sends a lookup
*/
@@ -1276,8 +1248,8 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no
* . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
*/
-int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
- int len, unsigned int flags, int *r_nodeid, int *result)
+static int _dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result)
{
struct dlm_rsb *r = NULL;
uint32_t hash;
@@ -1304,19 +1276,14 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
}
retry:
- error = pre_rsb_struct(ls);
- if (error < 0)
- return error;
-
- retry_lookup:
- /* check if the rsb is in keep state under read lock - likely path */
+ /* check if the rsb is active under read lock - likely path */
read_lock_bh(&ls->ls_rsbtbl_lock);
error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
if (!error) {
- if (rsb_flag(r, RSB_TOSS)) {
+ if (rsb_flag(r, RSB_INACTIVE)) {
read_unlock_bh(&ls->ls_rsbtbl_lock);
- goto do_toss;
+ goto do_inactive;
}
/* because the rsb is active, we need to lock_rsb before
@@ -1340,53 +1307,48 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
goto not_found;
}
- do_toss:
+ do_inactive:
/* unlikely path - relookup under write */
write_lock_bh(&ls->ls_rsbtbl_lock);
- /* rsb_mod_timer() requires to held ls_rsbtbl_lock in write lock
- * check if the rsb is still in toss state, if not relookup
- */
error = dlm_search_rsb_tree(&ls->ls_rsbtbl, name, len, &r);
if (!error) {
- if (!rsb_flag(r, RSB_TOSS)) {
+ if (!rsb_flag(r, RSB_INACTIVE)) {
write_unlock_bh(&ls->ls_rsbtbl_lock);
/* something as changed, very unlikely but
* try again
*/
- goto retry_lookup;
+ goto retry;
}
} else {
write_unlock_bh(&ls->ls_rsbtbl_lock);
goto not_found;
}
- /* because the rsb is inactive (on toss list), it's not refcounted
- * and lock_rsb is not used, but is protected by the rsbtbl lock
- */
+ /* because the rsb is inactive, it's not refcounted and lock_rsb
+ is not used, but is protected by the rsbtbl lock */
__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
r_nodeid, result);
- rsb_mod_timer(ls, r);
- /* the rsb was inactive (on toss list) */
+ /* A dir record rsb should never be on scan list. */
+ /* Try to fix this with del_scan? */
+ WARN_ON(!list_empty(&r->res_scan_list));
+
write_unlock_bh(&ls->ls_rsbtbl_lock);
return 0;
not_found:
error = get_rsb_struct(ls, name, len, &r);
- if (error == -EAGAIN)
- goto retry;
- if (error)
+ if (WARN_ON_ONCE(error))
goto out;
r->res_hash = hash;
r->res_dir_nodeid = our_nodeid;
r->res_master_nodeid = from_nodeid;
r->res_nodeid = from_nodeid;
- kref_init(&r->res_ref);
- rsb_set_flag(r, RSB_TOSS);
+ rsb_set_flag(r, RSB_INACTIVE);
write_lock_bh(&ls->ls_rsbtbl_lock);
error = rsb_insert(r, &ls->ls_rsbtbl);
@@ -1396,7 +1358,7 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
*/
write_unlock_bh(&ls->ls_rsbtbl_lock);
dlm_free_rsb(r);
- goto retry_lookup;
+ goto retry;
} else if (error) {
write_unlock_bh(&ls->ls_rsbtbl_lock);
/* should never happen */
@@ -1404,8 +1366,7 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
goto retry;
}
- list_add(&r->res_rsbs_list, &ls->ls_toss);
- rsb_mod_timer(ls, r);
+ list_add(&r->res_slow_list, &ls->ls_slow_inactive);
write_unlock_bh(&ls->ls_rsbtbl_lock);
if (result)
@@ -1415,12 +1376,22 @@ int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
return error;
}
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result)
+{
+ int rv;
+ rcu_read_lock();
+ rv = _dlm_master_lookup(ls, from_nodeid, name, len, flags, r_nodeid, result);
+ rcu_read_unlock();
+ return rv;
+}
+
static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
{
struct dlm_rsb *r;
read_lock_bh(&ls->ls_rsbtbl_lock);
- list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) {
+ list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
if (r->res_hash == hash)
dlm_dump_rsb(r);
}
@@ -1442,15 +1413,28 @@ void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len)
read_unlock_bh(&ls->ls_rsbtbl_lock);
}
-static void toss_rsb(struct kref *kref)
+static void deactivate_rsb(struct kref *kref)
{
struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
struct dlm_ls *ls = r->res_ls;
+ int our_nodeid = dlm_our_nodeid();
DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
- rsb_set_flag(r, RSB_TOSS);
- list_move(&r->res_rsbs_list, &ls->ls_toss);
- rsb_mod_timer(ls, r);
+ rsb_set_flag(r, RSB_INACTIVE);
+ list_move(&r->res_slow_list, &ls->ls_slow_inactive);
+
+ /*
+ * When the rsb becomes unused:
+ * - If it's not a dir record for a remote master rsb,
+ * then it is put on the scan list to be freed.
+ * - If it's a dir record for a remote master rsb,
+ * then it is kept in the inactive state until
+ * receive_remove() from the master node.
+ */
+ if (!dlm_no_directory(ls) &&
+ (r->res_master_nodeid != our_nodeid) &&
+ (dlm_dir_nodeid(r) != our_nodeid))
+ add_scan(ls, r);
if (r->res_lvbptr) {
dlm_free_lvb(r->res_lvbptr);
@@ -1464,22 +1448,22 @@ static void unhold_rsb(struct dlm_rsb *r)
{
int rv;
- /* rsbs in toss state never get referenced */
- WARN_ON(rsb_flag(r, RSB_TOSS));
- rv = kref_put(&r->res_ref, toss_rsb);
+ /* inactive rsbs are not ref counted */
+ WARN_ON(rsb_flag(r, RSB_INACTIVE));
+ rv = kref_put(&r->res_ref, deactivate_rsb);
DLM_ASSERT(!rv, dlm_dump_rsb(r););
}
-void free_toss_rsb(struct dlm_rsb *r)
+void free_inactive_rsb(struct dlm_rsb *r)
{
- WARN_ON_ONCE(!rsb_flag(r, RSB_TOSS));
+ WARN_ON_ONCE(!rsb_flag(r, RSB_INACTIVE));
DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
- DLM_ASSERT(list_empty(&r->res_toss_q_list), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_scan_list), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
DLM_ASSERT(list_empty(&r->res_masters_list), dlm_dump_rsb(r););
@@ -1504,11 +1488,15 @@ static void detach_lkb(struct dlm_lkb *lkb)
}
static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
- int start, int end)
+ unsigned long start, unsigned long end)
{
+ struct xa_limit limit;
struct dlm_lkb *lkb;
int rv;
+ limit.max = end;
+ limit.min = start;
+
lkb = dlm_allocate_lkb(ls);
if (!lkb)
return -ENOMEM;
@@ -1522,14 +1510,12 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
INIT_LIST_HEAD(&lkb->lkb_ownqueue);
INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
- write_lock_bh(&ls->ls_lkbidr_lock);
- rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
- if (rv >= 0)
- lkb->lkb_id = rv;
- write_unlock_bh(&ls->ls_lkbidr_lock);
+ write_lock_bh(&ls->ls_lkbxa_lock);
+ rv = xa_alloc(&ls->ls_lkbxa, &lkb->lkb_id, lkb, limit, GFP_ATOMIC);
+ write_unlock_bh(&ls->ls_lkbxa_lock);
if (rv < 0) {
- log_error(ls, "create_lkb idr error %d", rv);
+ log_error(ls, "create_lkb xa error %d", rv);
dlm_free_lkb(lkb);
return rv;
}
@@ -1540,18 +1526,18 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
- return _create_lkb(ls, lkb_ret, 1, 0);
+ return _create_lkb(ls, lkb_ret, 1, ULONG_MAX);
}
static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
struct dlm_lkb *lkb;
- read_lock_bh(&ls->ls_lkbidr_lock);
- lkb = idr_find(&ls->ls_lkbidr, lkid);
+ read_lock_bh(&ls->ls_lkbxa_lock);
+ lkb = xa_load(&ls->ls_lkbxa, lkid);
if (lkb)
kref_get(&lkb->lkb_ref);
- read_unlock_bh(&ls->ls_lkbidr_lock);
+ read_unlock_bh(&ls->ls_lkbxa_lock);
*lkb_ret = lkb;
return lkb ? 0 : -ENOENT;
@@ -1576,10 +1562,10 @@ static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
int rv;
rv = dlm_kref_put_write_lock_bh(&lkb->lkb_ref, kill_lkb,
- &ls->ls_lkbidr_lock);
+ &ls->ls_lkbxa_lock);
if (rv) {
- idr_remove(&ls->ls_lkbidr, lkid);
- write_unlock_bh(&ls->ls_lkbidr_lock);
+ xa_erase(&ls->ls_lkbxa, lkid);
+ write_unlock_bh(&ls->ls_lkbxa_lock);
detach_lkb(lkb);
@@ -4323,8 +4309,9 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms)
return;
}
- /* Look for name in rsb toss state, if it's there, kill it.
- * If it's in non toss state, it's being used, and we should ignore this
+ /*
+ * Look for inactive rsb, if it's there, free it.
+ * If the rsb is active, it's being used, and we should ignore this
* message. This is an expected race between the dir node sending a
* request to the master node at the same time as the master node sends
* a remove to the dir node. The resolution to that race is for the
@@ -4347,16 +4334,18 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms)
return;
}
- if (!rsb_flag(r, RSB_TOSS)) {
+ if (!rsb_flag(r, RSB_INACTIVE)) {
if (r->res_master_nodeid != from_nodeid) {
/* should not happen */
- log_error(ls, "receive_remove keep from %d master %d",
+ log_error(ls, "receive_remove on active rsb from %d master %d",
from_nodeid, r->res_master_nodeid);
dlm_print_rsb(r);
write_unlock_bh(&ls->ls_rsbtbl_lock);
return;
}
+ /* Ignore the remove message, see race comment above. */
+
log_debug(ls, "receive_remove from %d master %d first %x %s",
from_nodeid, r->res_master_nodeid, r->res_first_lkid,
name);
@@ -4365,19 +4354,20 @@ static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms)
}
if (r->res_master_nodeid != from_nodeid) {
- log_error(ls, "receive_remove toss from %d master %d",
+ log_error(ls, "receive_remove inactive from %d master %d",
from_nodeid, r->res_master_nodeid);
dlm_print_rsb(r);
write_unlock_bh(&ls->ls_rsbtbl_lock);
return;
}
- list_del(&r->res_rsbs_list);
+ list_del(&r->res_slow_list);
rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node,
dlm_rhash_rsb_params);
+ rsb_clear_flag(r, RSB_HASHED);
write_unlock_bh(&ls->ls_rsbtbl_lock);
- free_toss_rsb(r);
+ free_inactive_rsb(r);
}
static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms)
@@ -5444,7 +5434,7 @@ static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls)
struct dlm_rsb *r;
read_lock_bh(&ls->ls_rsbtbl_lock);
- list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) {
+ list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
if (!rsb_flag(r, RSB_RECOVER_GRANT))
continue;
if (!is_master(r)) {
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 8de9dee4c058..4ed8d36f9c6d 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -11,7 +11,6 @@
#ifndef __LOCK_DOT_H__
#define __LOCK_DOT_H__
-void dlm_rsb_toss_timer(struct timer_list *timer);
void dlm_dump_rsb(struct dlm_rsb *r);
void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len);
void dlm_print_lkb(struct dlm_lkb *lkb);
@@ -19,15 +18,15 @@ void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms,
uint32_t saved_seq);
void dlm_receive_buffer(const union dlm_packet *p, int nodeid);
int dlm_modes_compat(int mode1, int mode2);
-void free_toss_rsb(struct dlm_rsb *r);
+void free_inactive_rsb(struct dlm_rsb *r);
void dlm_put_rsb(struct dlm_rsb *r);
void dlm_hold_rsb(struct dlm_rsb *r);
int dlm_put_lkb(struct dlm_lkb *lkb);
-void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_lock_recovery(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_timer_resume(struct dlm_ls *ls);
+void dlm_rsb_scan(struct timer_list *timer);
+void resume_scan_timer(struct dlm_ls *ls);
int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
int len, unsigned int flags, int *r_nodeid, int *result);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 475ab4370dda..1848cbbc96a9 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -38,7 +38,7 @@ static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
if (rc)
return rc;
- ls = dlm_find_lockspace_local(ls->ls_local_handle);
+ ls = dlm_find_lockspace_local(ls);
if (!ls)
return -EINVAL;
@@ -265,18 +265,9 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
{
- struct dlm_ls *ls;
+ struct dlm_ls *ls = lockspace;
- spin_lock_bh(&lslist_lock);
- list_for_each_entry(ls, &lslist, ls_list) {
- if (ls->ls_local_handle == lockspace) {
- atomic_inc(&ls->ls_count);
- goto out;
- }
- }
- ls = NULL;
- out:
- spin_unlock_bh(&lslist_lock);
+ atomic_inc(&ls->ls_count);
return ls;
}
@@ -410,37 +401,37 @@ static int new_lockspace(const char *name, const char *cluster,
atomic_set(&ls->ls_count, 0);
init_waitqueue_head(&ls->ls_count_wait);
ls->ls_flags = 0;
- ls->ls_scan_time = jiffies;
if (ops && dlm_config.ci_recover_callbacks) {
ls->ls_ops = ops;
ls->ls_ops_arg = ops_arg;
}
+ if (flags & DLM_LSFL_SOFTIRQ)
+ set_bit(LSFL_SOFTIRQ, &ls->ls_flags);
+
/* ls_exflags are forced to match among nodes, and we don't
* need to require all nodes to have some flags set
*/
- ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
+ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+ DLM_LSFL_SOFTIRQ));
- INIT_LIST_HEAD(&ls->ls_toss);
- INIT_LIST_HEAD(&ls->ls_keep);
+ INIT_LIST_HEAD(&ls->ls_slow_inactive);
+ INIT_LIST_HEAD(&ls->ls_slow_active);
rwlock_init(&ls->ls_rsbtbl_lock);
error = rhashtable_init(&ls->ls_rsbtbl, &dlm_rhash_rsb_params);
if (error)
goto out_lsfree;
- idr_init(&ls->ls_lkbidr);
- rwlock_init(&ls->ls_lkbidr_lock);
+ xa_init_flags(&ls->ls_lkbxa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_BH);
+ rwlock_init(&ls->ls_lkbxa_lock);
INIT_LIST_HEAD(&ls->ls_waiters);
spin_lock_init(&ls->ls_waiters_lock);
INIT_LIST_HEAD(&ls->ls_orphans);
spin_lock_init(&ls->ls_orphans_lock);
- INIT_LIST_HEAD(&ls->ls_new_rsb);
- spin_lock_init(&ls->ls_new_rsb_spin);
-
INIT_LIST_HEAD(&ls->ls_nodes);
INIT_LIST_HEAD(&ls->ls_nodes_gone);
ls->ls_num_nodes = 0;
@@ -484,7 +475,7 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS);
if (!ls->ls_recover_buf) {
error = -ENOMEM;
- goto out_lkbidr;
+ goto out_lkbxa;
}
ls->ls_slot = 0;
@@ -494,32 +485,31 @@ static int new_lockspace(const char *name, const char *cluster,
INIT_LIST_HEAD(&ls->ls_recover_list);
spin_lock_init(&ls->ls_recover_list_lock);
- idr_init(&ls->ls_recover_idr);
- spin_lock_init(&ls->ls_recover_idr_lock);
+ xa_init_flags(&ls->ls_recover_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_BH);
+ spin_lock_init(&ls->ls_recover_xa_lock);
ls->ls_recover_list_count = 0;
- ls->ls_local_handle = ls;
init_waitqueue_head(&ls->ls_wait_general);
INIT_LIST_HEAD(&ls->ls_masters_list);
rwlock_init(&ls->ls_masters_lock);
INIT_LIST_HEAD(&ls->ls_dir_dump_list);
rwlock_init(&ls->ls_dir_dump_lock);
- INIT_LIST_HEAD(&ls->ls_toss_q);
- spin_lock_init(&ls->ls_toss_q_lock);
- timer_setup(&ls->ls_timer, dlm_rsb_toss_timer,
- TIMER_DEFERRABLE);
+ INIT_LIST_HEAD(&ls->ls_scan_list);
+ spin_lock_init(&ls->ls_scan_lock);
+ timer_setup(&ls->ls_scan_timer, dlm_rsb_scan, TIMER_DEFERRABLE);
spin_lock_bh(&lslist_lock);
ls->ls_create_count = 1;
list_add(&ls->ls_list, &lslist);
spin_unlock_bh(&lslist_lock);
- if (flags & DLM_LSFL_FS) {
- error = dlm_callback_start(ls);
- if (error) {
- log_error(ls, "can't start dlm_callback %d", error);
- goto out_delist;
- }
+ if (flags & DLM_LSFL_FS)
+ set_bit(LSFL_FS, &ls->ls_flags);
+
+ error = dlm_callback_start(ls);
+ if (error) {
+ log_error(ls, "can't start dlm_callback %d", error);
+ goto out_delist;
}
init_waitqueue_head(&ls->ls_recover_lock_wait);
@@ -584,10 +574,10 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_bh(&lslist_lock);
list_del(&ls->ls_list);
spin_unlock_bh(&lslist_lock);
- idr_destroy(&ls->ls_recover_idr);
+ xa_destroy(&ls->ls_recover_xa);
kfree(ls->ls_recover_buf);
- out_lkbidr:
- idr_destroy(&ls->ls_lkbidr);
+ out_lkbxa:
+ xa_destroy(&ls->ls_lkbxa);
rhashtable_destroy(&ls->ls_rsbtbl);
out_lsfree:
if (do_unreg)
@@ -643,26 +633,15 @@ int dlm_new_user_lockspace(const char *name, const char *cluster,
void *ops_arg, int *ops_result,
dlm_lockspace_t **lockspace)
{
+ if (flags & DLM_LSFL_SOFTIRQ)
+ return -EINVAL;
+
return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
ops_arg, ops_result, lockspace);
}
-static int lkb_idr_is_local(int id, void *p, void *data)
-{
- struct dlm_lkb *lkb = p;
-
- return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV;
-}
-
-static int lkb_idr_is_any(int id, void *p, void *data)
-{
- return 1;
-}
-
-static int lkb_idr_free(int id, void *p, void *data)
+static int lkb_idr_free(struct dlm_lkb *lkb)
{
- struct dlm_lkb *lkb = p;
-
if (lkb->lkb_lvbptr && test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags))
dlm_free_lvb(lkb->lkb_lvbptr);
@@ -670,23 +649,34 @@ static int lkb_idr_free(int id, void *p, void *data)
return 0;
}
-/* NOTE: We check the lkbidr here rather than the resource table.
+/* NOTE: We check the lkbxa here rather than the resource table.
This is because there may be LKBs queued as ASTs that have been unlinked
from their RSBs and are pending deletion once the AST has been delivered */
static int lockspace_busy(struct dlm_ls *ls, int force)
{
- int rv;
+ struct dlm_lkb *lkb;
+ unsigned long id;
+ int rv = 0;
- read_lock_bh(&ls->ls_lkbidr_lock);
+ read_lock_bh(&ls->ls_lkbxa_lock);
if (force == 0) {
- rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_any, ls);
+ xa_for_each(&ls->ls_lkbxa, id, lkb) {
+ rv = 1;
+ break;
+ }
} else if (force == 1) {
- rv = idr_for_each(&ls->ls_lkbidr, lkb_idr_is_local, ls);
+ xa_for_each(&ls->ls_lkbxa, id, lkb) {
+ if (lkb->lkb_nodeid == 0 &&
+ lkb->lkb_grmode != DLM_LOCK_IV) {
+ rv = 1;
+ break;
+ }
+ }
} else {
rv = 0;
}
- read_unlock_bh(&ls->ls_lkbidr_lock);
+ read_unlock_bh(&ls->ls_lkbxa_lock);
return rv;
}
@@ -699,7 +689,8 @@ static void rhash_free_rsb(void *ptr, void *arg)
static int release_lockspace(struct dlm_ls *ls, int force)
{
- struct dlm_rsb *rsb;
+ struct dlm_lkb *lkb;
+ unsigned long id;
int busy, rv;
busy = lockspace_busy(ls, force);
@@ -739,7 +730,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
* time_shutdown_sync(), we don't care anymore
*/
clear_bit(LSFL_RUNNING, &ls->ls_flags);
- timer_shutdown_sync(&ls->ls_timer);
+ timer_shutdown_sync(&ls->ls_scan_timer);
if (ls_count == 1) {
dlm_clear_members(ls);
@@ -752,28 +743,22 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_delete_debug_file(ls);
- idr_destroy(&ls->ls_recover_idr);
+ xa_destroy(&ls->ls_recover_xa);
kfree(ls->ls_recover_buf);
/*
- * Free all lkb's in idr
+ * Free all lkb's in xa
*/
-
- idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls);
- idr_destroy(&ls->ls_lkbidr);
+ xa_for_each(&ls->ls_lkbxa, id, lkb) {
+ lkb_idr_free(lkb);
+ }
+ xa_destroy(&ls->ls_lkbxa);
/*
* Free all rsb's on rsbtbl
*/
rhashtable_free_and_destroy(&ls->ls_rsbtbl, rhash_free_rsb, NULL);
- while (!list_empty(&ls->ls_new_rsb)) {
- rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
- res_hashchain);
- list_del(&rsb->res_hashchain);
- dlm_free_rsb(rsb);
- }
-
/*
* Free structures on any other lists
*/
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 6b8078085e56..2e3e269d820e 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -461,7 +461,7 @@ static bool dlm_lowcomms_con_has_addr(const struct connection *con,
return false;
}
-int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr)
{
struct connection *con;
bool ret, idx;
@@ -858,12 +858,6 @@ static void free_processqueue_entry(struct processqueue_entry *pentry)
kfree(pentry);
}
-struct dlm_processed_nodes {
- int nodeid;
-
- struct list_head list;
-};
-
static void process_dlm_messages(struct work_struct *work)
{
struct processqueue_entry *pentry;
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 8deb16f8f620..fd0df604eb93 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -46,7 +46,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg);
int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
-int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr);
void dlm_midcomms_receive_done(int nodeid);
struct kmem_cache *dlm_lowcomms_writequeue_cache_create(void);
struct kmem_cache *dlm_lowcomms_msg_cache_create(void);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index c46e306f2e5c..a7ee7fd2b9d3 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -642,7 +642,7 @@ int dlm_ls_stop(struct dlm_ls *ls)
set_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
if (new)
- timer_delete_sync(&ls->ls_timer);
+ timer_delete_sync(&ls->ls_scan_timer);
ls->ls_recover_seq++;
/* activate requestqueue and stop processing */
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 15a8b1cee433..8c44b954c166 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -72,6 +72,8 @@ out:
void dlm_memory_exit(void)
{
+ rcu_barrier();
+
kmem_cache_destroy(writequeue_cache);
kmem_cache_destroy(mhandle_cache);
kmem_cache_destroy(msg_cache);
@@ -101,13 +103,19 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls)
return r;
}
-void dlm_free_rsb(struct dlm_rsb *r)
+static void __free_rsb_rcu(struct rcu_head *rcu)
{
+ struct dlm_rsb *r = container_of(rcu, struct dlm_rsb, rcu);
if (r->res_lvbptr)
dlm_free_lvb(r->res_lvbptr);
kmem_cache_free(rsb_cache, r);
}
+void dlm_free_rsb(struct dlm_rsb *r)
+{
+ call_rcu(&r->rcu, __free_rsb_rcu);
+}
+
struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
{
struct dlm_lkb *lkb;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index c34f38e9ee5c..2c101bbe261a 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -334,12 +334,12 @@ static struct midcomms_node *nodeid2node(int nodeid)
return __find_node(nodeid, nodeid_hash(nodeid));
}
-int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr)
{
int ret, idx, r = nodeid_hash(nodeid);
struct midcomms_node *node;
- ret = dlm_lowcomms_addr(nodeid, addr, len);
+ ret = dlm_lowcomms_addr(nodeid, addr);
if (ret)
return ret;
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 278d26fdeb2c..7fad1d170bba 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -19,7 +19,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, char **ppc);
void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
int namelen);
-int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr);
void dlm_midcomms_version_wait(void);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index f493d5f30c58..c7afb428a2b4 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -293,73 +293,78 @@ static void recover_list_clear(struct dlm_ls *ls)
spin_unlock_bh(&ls->ls_recover_list_lock);
}
-static int recover_idr_empty(struct dlm_ls *ls)
+static int recover_xa_empty(struct dlm_ls *ls)
{
int empty = 1;
- spin_lock_bh(&ls->ls_recover_idr_lock);
+ spin_lock_bh(&ls->ls_recover_xa_lock);
if (ls->ls_recover_list_count)
empty = 0;
- spin_unlock_bh(&ls->ls_recover_idr_lock);
+ spin_unlock_bh(&ls->ls_recover_xa_lock);
return empty;
}
-static int recover_idr_add(struct dlm_rsb *r)
+static int recover_xa_add(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
+ struct xa_limit limit = {
+ .min = 1,
+ .max = UINT_MAX,
+ };
+ uint32_t id;
int rv;
- spin_lock_bh(&ls->ls_recover_idr_lock);
+ spin_lock_bh(&ls->ls_recover_xa_lock);
if (r->res_id) {
rv = -1;
goto out_unlock;
}
- rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
+ rv = xa_alloc(&ls->ls_recover_xa, &id, r, limit, GFP_ATOMIC);
if (rv < 0)
goto out_unlock;
- r->res_id = rv;
+ r->res_id = id;
ls->ls_recover_list_count++;
dlm_hold_rsb(r);
rv = 0;
out_unlock:
- spin_unlock_bh(&ls->ls_recover_idr_lock);
+ spin_unlock_bh(&ls->ls_recover_xa_lock);
return rv;
}
-static void recover_idr_del(struct dlm_rsb *r)
+static void recover_xa_del(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
- spin_lock_bh(&ls->ls_recover_idr_lock);
- idr_remove(&ls->ls_recover_idr, r->res_id);
+ spin_lock_bh(&ls->ls_recover_xa_lock);
+ xa_erase_bh(&ls->ls_recover_xa, r->res_id);
r->res_id = 0;
ls->ls_recover_list_count--;
- spin_unlock_bh(&ls->ls_recover_idr_lock);
+ spin_unlock_bh(&ls->ls_recover_xa_lock);
dlm_put_rsb(r);
}
-static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
+static struct dlm_rsb *recover_xa_find(struct dlm_ls *ls, uint64_t id)
{
struct dlm_rsb *r;
- spin_lock_bh(&ls->ls_recover_idr_lock);
- r = idr_find(&ls->ls_recover_idr, (int)id);
- spin_unlock_bh(&ls->ls_recover_idr_lock);
+ spin_lock_bh(&ls->ls_recover_xa_lock);
+ r = xa_load(&ls->ls_recover_xa, (int)id);
+ spin_unlock_bh(&ls->ls_recover_xa_lock);
return r;
}
-static void recover_idr_clear(struct dlm_ls *ls)
+static void recover_xa_clear(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- int id;
+ unsigned long id;
- spin_lock_bh(&ls->ls_recover_idr_lock);
+ spin_lock_bh(&ls->ls_recover_xa_lock);
- idr_for_each_entry(&ls->ls_recover_idr, r, id) {
- idr_remove(&ls->ls_recover_idr, id);
+ xa_for_each(&ls->ls_recover_xa, id, r) {
+ xa_erase_bh(&ls->ls_recover_xa, id);
r->res_id = 0;
r->res_recover_locks_count = 0;
ls->ls_recover_list_count--;
@@ -372,7 +377,7 @@ static void recover_idr_clear(struct dlm_ls *ls)
ls->ls_recover_list_count);
ls->ls_recover_list_count = 0;
}
- spin_unlock_bh(&ls->ls_recover_idr_lock);
+ spin_unlock_bh(&ls->ls_recover_xa_lock);
}
@@ -470,7 +475,7 @@ static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq)
set_new_master(r);
error = 0;
} else {
- recover_idr_add(r);
+ recover_xa_add(r);
error = dlm_send_rcom_lookup(r, dir_nodeid, seq);
}
@@ -551,10 +556,10 @@ int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq,
log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
- error = dlm_wait_function(ls, &recover_idr_empty);
+ error = dlm_wait_function(ls, &recover_xa_empty);
out:
if (error)
- recover_idr_clear(ls);
+ recover_xa_clear(ls);
return error;
}
@@ -563,7 +568,7 @@ int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc)
struct dlm_rsb *r;
int ret_nodeid, new_master;
- r = recover_idr_find(ls, le64_to_cpu(rc->rc_id));
+ r = recover_xa_find(ls, le64_to_cpu(rc->rc_id));
if (!r) {
log_error(ls, "dlm_recover_master_reply no id %llx",
(unsigned long long)le64_to_cpu(rc->rc_id));
@@ -582,9 +587,9 @@ int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc)
r->res_nodeid = new_master;
set_new_master(r);
unlock_rsb(r);
- recover_idr_del(r);
+ recover_xa_del(r);
- if (recover_idr_empty(ls))
+ if (recover_xa_empty(ls))
wake_up(&ls->ls_wait_general);
out:
return 0;
@@ -877,29 +882,26 @@ void dlm_recover_rsbs(struct dlm_ls *ls, const struct list_head *root_list)
log_rinfo(ls, "dlm_recover_rsbs %d done", count);
}
-/* Create a single list of all root rsb's to be used during recovery */
-
-void dlm_clear_toss(struct dlm_ls *ls)
+void dlm_clear_inactive(struct dlm_ls *ls)
{
struct dlm_rsb *r, *safe;
unsigned int count = 0;
write_lock_bh(&ls->ls_rsbtbl_lock);
- list_for_each_entry_safe(r, safe, &ls->ls_toss, res_rsbs_list) {
- list_del(&r->res_rsbs_list);
+ list_for_each_entry_safe(r, safe, &ls->ls_slow_inactive, res_slow_list) {
+ list_del(&r->res_slow_list);
rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node,
dlm_rhash_rsb_params);
- /* remove it from the toss queue if its part of it */
- if (!list_empty(&r->res_toss_q_list))
- list_del_init(&r->res_toss_q_list);
+ if (!list_empty(&r->res_scan_list))
+ list_del_init(&r->res_scan_list);
- free_toss_rsb(r);
+ free_inactive_rsb(r);
count++;
}
write_unlock_bh(&ls->ls_rsbtbl_lock);
if (count)
- log_rinfo(ls, "dlm_clear_toss %u done", count);
+ log_rinfo(ls, "dlm_clear_inactive %u done", count);
}
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
index efc79a6e577d..ec69896462fb 100644
--- a/fs/dlm/recover.h
+++ b/fs/dlm/recover.h
@@ -25,7 +25,7 @@ int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc);
int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq,
const struct list_head *root_list);
void dlm_recovered_lock(struct dlm_rsb *r);
-void dlm_clear_toss(struct dlm_ls *ls);
+void dlm_clear_inactive(struct dlm_ls *ls);
void dlm_recover_rsbs(struct dlm_ls *ls, const struct list_head *root_list);
#endif /* __RECOVER_DOT_H__ */
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 17a40d1e6036..34f4f9f49a6c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -33,7 +33,7 @@ static int dlm_create_masters_list(struct dlm_ls *ls)
}
read_lock_bh(&ls->ls_rsbtbl_lock);
- list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) {
+ list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
if (r->res_nodeid)
continue;
@@ -63,12 +63,12 @@ static void dlm_create_root_list(struct dlm_ls *ls, struct list_head *root_list)
struct dlm_rsb *r;
read_lock_bh(&ls->ls_rsbtbl_lock);
- list_for_each_entry(r, &ls->ls_keep, res_rsbs_list) {
+ list_for_each_entry(r, &ls->ls_slow_active, res_slow_list) {
list_add(&r->res_root_list, root_list);
dlm_hold_rsb(r);
}
- WARN_ON_ONCE(!list_empty(&ls->ls_toss));
+ WARN_ON_ONCE(!list_empty(&ls->ls_slow_inactive));
read_unlock_bh(&ls->ls_rsbtbl_lock);
}
@@ -98,16 +98,16 @@ static int enable_locking(struct dlm_ls *ls, uint64_t seq)
spin_lock_bh(&ls->ls_recover_lock);
if (ls->ls_recover_seq == seq) {
set_bit(LSFL_RUNNING, &ls->ls_flags);
- /* Schedule next timer if recovery put something on toss.
+ /* Schedule next timer if recovery put something on inactive.
*
* The rsbs that was queued while recovery on toss hasn't
* started yet because LSFL_RUNNING was set everything
* else recovery hasn't started as well because ls_in_recovery
* is still hold. So we should not run into the case that
- * dlm_timer_resume() queues a timer that can occur in
+ * resume_scan_timer() queues a timer that can occur in
* a no op.
*/
- dlm_timer_resume(ls);
+ resume_scan_timer(ls);
/* unblocks processes waiting to enter the dlm */
up_write(&ls->ls_in_recovery);
clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
@@ -131,7 +131,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_callback_suspend(ls);
- dlm_clear_toss(ls);
+ dlm_clear_inactive(ls);
/*
* This list of root rsb's will be the basis of most of the recovery
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 3173b974e8c8..5cb3896be826 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -182,7 +182,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
struct dlm_user_args *ua;
struct dlm_user_proc *proc;
struct dlm_callback *cb;
- int rv;
+ int rv, copy_lvb;
if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) ||
test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags))
@@ -213,28 +213,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_lock_bh(&proc->asts_spin);
- rv = dlm_queue_lkb_callback(lkb, flags, mode, status, sbflags, &cb);
- switch (rv) {
- case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
- cb->ua = *ua;
- cb->lkb_lksb = &cb->ua.lksb;
- if (cb->copy_lvb) {
- memcpy(cb->lvbptr, ua->lksb.sb_lvbptr,
- DLM_USER_LVB_LEN);
- cb->lkb_lksb->sb_lvbptr = cb->lvbptr;
+ if (!dlm_may_skip_callback(lkb, flags, mode, status, sbflags,
+ &copy_lvb)) {
+ rv = dlm_get_cb(lkb, flags, mode, status, sbflags, &cb);
+ if (!rv) {
+ cb->copy_lvb = copy_lvb;
+ cb->ua = *ua;
+ cb->lkb_lksb = &cb->ua.lksb;
+ if (copy_lvb) {
+ memcpy(cb->lvbptr, ua->lksb.sb_lvbptr,
+ DLM_USER_LVB_LEN);
+ cb->lkb_lksb->sb_lvbptr = cb->lvbptr;
+ }
+
+ list_add_tail(&cb->list, &proc->asts);
+ wake_up_interruptible(&proc->wait);
}
-
- list_add_tail(&cb->list, &proc->asts);
- wake_up_interruptible(&proc->wait);
- break;
- case DLM_ENQUEUE_CALLBACK_SUCCESS:
- break;
- case DLM_ENQUEUE_CALLBACK_FAILURE:
- fallthrough;
- default:
- spin_unlock_bh(&proc->asts_spin);
- WARN_ON_ONCE(1);
- goto out;
}
spin_unlock_bh(&proc->asts_spin);
@@ -454,7 +448,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
if (params->flags & DLM_USER_LSFLG_FORCEFREE)
force = 2;
- lockspace = ls->ls_local_handle;
+ lockspace = ls;
dlm_put_lockspace(ls);
/* The final dlm_release_lockspace waits for references to go to
@@ -657,7 +651,7 @@ static int device_open(struct inode *inode, struct file *file)
return -ENOMEM;
}
- proc->lockspace = ls->ls_local_handle;
+ proc->lockspace = ls;
INIT_LIST_HEAD(&proc->asts);
INIT_LIST_HEAD(&proc->locks);
INIT_LIST_HEAD(&proc->unlocking);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index bb14462f6d99..a929f1b613be 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -275,8 +275,8 @@ enum {
};
static const struct fs_parameter_spec efivarfs_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
{},
};
@@ -293,14 +293,10 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para
switch (opt) {
case Opt_uid:
- opts->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(opts->uid))
- return -EINVAL;
+ opts->uid = result.uid;
break;
case Opt_gid:
- opts->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(opts->gid))
- return -EINVAL;
+ opts->gid = result.gid;
break;
default:
return -EINVAL;
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 7844ab24b813..462619e59766 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -311,4 +311,5 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
return 0;
}
+MODULE_DESCRIPTION("Extent File System (efs)");
MODULE_LICENSE("GPL");
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3b03a573cb1a..7749feded722 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -14,10 +14,9 @@
static int efs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- char *link = page_address(page);
- struct buffer_head * bh;
- struct inode * inode = page->mapping->host;
+ char *link = folio_address(folio);
+ struct buffer_head *bh;
+ struct inode *inode = folio->mapping->host;
efs_block_t size = inode->i_size;
int err;
@@ -40,12 +39,9 @@ static int efs_symlink_read_folio(struct file *file, struct folio *folio)
brelse(bh);
}
link[size] = '\0';
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ err = 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, err == 0);
return err;
}
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 19d53c30c8af..7bfe251680ec 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -24,6 +24,8 @@ struct z_erofs_decompressor {
void *data, int size);
int (*decompress)(struct z_erofs_decompress_req *rq,
struct page **pagepool);
+ int (*init)(void);
+ void (*exit)(void);
char *name;
};
@@ -52,17 +54,14 @@ struct z_erofs_decompressor {
*/
/*
- * short-lived pages are pages directly from buddy system with specific
- * page->private (no need to set PagePrivate since these are non-LRU /
- * non-movable pages and bypass reclaim / migration code).
+ * Currently, short-lived pages are pages directly from buddy system
+ * with specific page->private (Z_EROFS_SHORTLIVED_PAGE).
+ * In the future world of Memdescs, it should be type 0 (Misc) memory
+ * which type can be checked with a new helper.
*/
static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- if (page->private != Z_EROFS_SHORTLIVED_PAGE)
- return false;
-
- DBG_BUGON(page->mapping);
- return true;
+ return page->private == Z_EROFS_SHORTLIVED_PAGE;
}
static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
@@ -70,32 +69,32 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
{
if (!z_erofs_is_shortlived_page(page))
return false;
-
- /* short-lived pages should not be used by others at the same time */
- if (page_ref_count(page) > 1) {
- put_page(page);
- } else {
- /* follow the pcluster rule above. */
- erofs_pagepool_add(pagepool, page);
- }
+ erofs_pagepool_add(pagepool, page);
return true;
}
+extern const struct z_erofs_decompressor z_erofs_lzma_decomp;
+extern const struct z_erofs_decompressor z_erofs_deflate_decomp;
+extern const struct z_erofs_decompressor z_erofs_zstd_decomp;
+extern const struct z_erofs_decompressor *z_erofs_decomp[];
+
+struct z_erofs_stream_dctx {
+ struct z_erofs_decompress_req *rq;
+ unsigned int inpages, outpages; /* # of {en,de}coded pages */
+ int no, ni; /* the current {en,de}coded page # */
+
+ unsigned int avail_out; /* remaining bytes in the decoded buffer */
+ unsigned int inbuf_pos, inbuf_sz;
+ /* current status of the encoded buffer */
+ u8 *kin, *kout; /* buffer mapped pointers */
+ void *bounce; /* bounce buffer for inplace I/Os */
+ bool bounced; /* is the bounce buffer used now? */
+};
+
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl);
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
-extern const struct z_erofs_decompressor erofs_decompressors[];
-
-/* prototypes for specific algorithms */
-int z_erofs_load_lzma_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_load_deflate_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_load_zstd_config(struct super_block *sb,
- struct erofs_super_block *dsb, void *data, int size);
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool);
-int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl);
+int __init z_erofs_init_decompressor(void);
+void z_erofs_exit_decompressor(void);
#endif
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 9d85b6c11c6b..c2253b6a5416 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "compress.h"
#include <linux/lz4.h>
@@ -109,7 +110,6 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (top) {
victim = availables[--top];
- get_page(victim);
} else {
victim = __erofs_allocpage(pagepool, rq->gfp, true);
if (!victim)
@@ -371,40 +371,113 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
return 0;
}
-const struct z_erofs_decompressor erofs_decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
+int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
+ void **src, struct page **pgpl)
+{
+ struct z_erofs_decompress_req *rq = dctx->rq;
+ struct super_block *sb = rq->sb;
+ struct page **pgo, *tmppage;
+ unsigned int j;
+
+ if (!dctx->avail_out) {
+ if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ return -EFSCORRUPTED;
+ }
+
+ if (dctx->kout)
+ kunmap_local(dctx->kout);
+ dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out);
+ rq->outputsize -= dctx->avail_out;
+ pgo = &rq->out[dctx->no];
+ if (!*pgo && rq->fillgaps) { /* deduped */
+ *pgo = erofs_allocpage(pgpl, rq->gfp);
+ if (!*pgo) {
+ dctx->kout = NULL;
+ return -ENOMEM;
+ }
+ set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE);
+ }
+ if (*pgo) {
+ dctx->kout = kmap_local_page(*pgo);
+ *dst = dctx->kout + rq->pageofs_out;
+ } else {
+ *dst = dctx->kout = NULL;
+ }
+ rq->pageofs_out = 0;
+ }
+
+ if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
+ if (++dctx->ni >= dctx->inpages) {
+ erofs_err(sb, "invalid compressed data");
+ return -EFSCORRUPTED;
+ }
+ if (dctx->kout) /* unlike kmap(), take care of the orders */
+ kunmap_local(dctx->kout);
+ kunmap_local(dctx->kin);
+
+ dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE);
+ rq->inputsize -= dctx->inbuf_sz;
+ dctx->kin = kmap_local_page(rq->in[dctx->ni]);
+ *src = dctx->kin;
+ dctx->bounced = false;
+ if (dctx->kout) {
+ j = (u8 *)*dst - dctx->kout;
+ dctx->kout = kmap_local_page(rq->out[dctx->no]);
+ *dst = dctx->kout + j;
+ }
+ dctx->inbuf_pos = 0;
+ }
+
+ /*
+ * Handle overlapping: Use the given bounce buffer if the input data is
+ * under processing; Or utilize short-lived pages from the on-stack page
+ * pool, where pages are shared among the same request. Note that only
+ * a few inplace I/O pages need to be doubled.
+ */
+ if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) {
+ memcpy(dctx->bounce, *src, dctx->inbuf_sz);
+ *src = dctx->bounce;
+ dctx->bounced = true;
+ }
+
+ for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ if (rq->out[dctx->no] != rq->in[j])
+ continue;
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage)
+ return -ENOMEM;
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ return 0;
+}
+
+const struct z_erofs_decompressor *z_erofs_decomp[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "shifted"
},
- [Z_EROFS_COMPRESSION_INTERLACED] = {
+ [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) {
.decompress = z_erofs_transform_plain,
.name = "interlaced"
},
- [Z_EROFS_COMPRESSION_LZ4] = {
+ [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) {
.config = z_erofs_load_lz4_config,
.decompress = z_erofs_lz4_decompress,
+ .init = z_erofs_gbuf_init,
+ .exit = z_erofs_gbuf_exit,
.name = "lz4"
},
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
- [Z_EROFS_COMPRESSION_LZMA] = {
- .config = z_erofs_load_lzma_config,
- .decompress = z_erofs_lzma_decompress,
- .name = "lzma"
- },
+ [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp,
#endif
#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
- [Z_EROFS_COMPRESSION_DEFLATE] = {
- .config = z_erofs_load_deflate_config,
- .decompress = z_erofs_deflate_decompress,
- .name = "deflate"
- },
+ [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp,
#endif
#ifdef CONFIG_EROFS_FS_ZIP_ZSTD
- [Z_EROFS_COMPRESSION_ZSTD] = {
- .config = z_erofs_load_zstd_config,
- .decompress = z_erofs_zstd_decompress,
- .name = "zstd"
- },
+ [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp,
#endif
};
@@ -432,6 +505,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ const struct z_erofs_decompressor *dec = z_erofs_decomp[alg];
void *data;
if (!(algs & 1))
@@ -443,16 +517,13 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
break;
}
- if (alg >= ARRAY_SIZE(erofs_decompressors) ||
- !erofs_decompressors[alg].config) {
+ if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) {
+ ret = dec->config(sb, dsb, data, size);
+ } else {
erofs_err(sb, "algorithm %d isn't enabled on this kernel",
alg);
ret = -EOPNOTSUPP;
- } else {
- ret = erofs_decompressors[alg].config(sb,
- dsb, data, size);
}
-
kfree(data);
if (ret)
break;
@@ -460,3 +531,28 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
erofs_put_metabuf(&buf);
return ret;
}
+
+int __init z_erofs_init_decompressor(void)
+{
+ int i, err;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
+ err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
+ if (err) {
+ while (--i)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+ return err;
+ }
+ }
+ return 0;
+}
+
+void z_erofs_exit_decompressor(void)
+{
+ int i;
+
+ for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i)
+ if (z_erofs_decomp[i])
+ z_erofs_decomp[i]->exit();
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 3a3461561a3c..5070d2fcc737 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -15,7 +15,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
-void z_erofs_deflate_exit(void)
+static void z_erofs_deflate_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_deflate_avail_strms) {
@@ -41,7 +41,7 @@ void z_erofs_deflate_exit(void)
}
}
-int __init z_erofs_deflate_init(void)
+static int __init z_erofs_deflate_init(void)
{
/* by default, use # of possible CPUs instead */
if (!z_erofs_deflate_nstrms)
@@ -49,7 +49,7 @@ int __init z_erofs_deflate_init(void)
return 0;
}
-int z_erofs_load_deflate_config(struct super_block *sb,
+static int z_erofs_load_deflate_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
struct z_erofs_deflate_cfgs *dfl = data;
@@ -97,27 +97,26 @@ failed:
return -ENOMEM;
}
-int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
struct super_block *sb = rq->sb;
- unsigned int insz, outsz, pofs;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
struct z_erofs_deflate *strm;
- u8 *kin, *kout = NULL;
- bool bounced = false;
- int no = -1, ni = 0, j = 0, zerr, err;
+ int zerr, err;
/* 1. get the exact DEFLATE compressed size */
- kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
return err;
}
@@ -134,102 +133,35 @@ again:
spin_unlock(&z_erofs_deflate_lock);
/* 3. multi-call decompress */
- insz = rq->inputsize;
- outsz = rq->outputsize;
zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
if (zerr != Z_OK) {
err = -EIO;
goto failed_zinit;
}
- pofs = rq->pageofs_out;
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
- insz -= strm->z.avail_in;
- strm->z.next_in = kin + rq->pageofs_in;
+ rq->fillgaps = true; /* DEFLATE doesn't support NULL output buffer */
+ strm->z.avail_in = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= strm->z.avail_in;
+ strm->z.next_in = dctx.kin + rq->pageofs_in;
strm->z.avail_out = 0;
+ dctx.bounce = strm->bounce;
while (1) {
- if (!strm->z.avail_out) {
- if (++no >= nrpages_out || !outsz) {
- erofs_err(sb, "insufficient space for decompressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout)
- kunmap_local(kout);
- strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
- outsz -= strm->z.avail_out;
- if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
- if (!rq->out[no]) {
- kout = NULL;
- err = -ENOMEM;
- break;
- }
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + pofs;
- pofs = 0;
- }
-
- if (!strm->z.avail_in && insz) {
- if (++ni >= nrpages_in) {
- erofs_err(sb, "invalid compressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout) { /* unlike kmap(), take care of the orders */
- j = strm->z.next_out - kout;
- kunmap_local(kout);
- }
- kunmap_local(kin);
- strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
- insz -= strm->z.avail_in;
- kin = kmap_local_page(rq->in[ni]);
- strm->z.next_in = kin;
- bounced = false;
- if (kout) {
- kout = kmap_local_page(rq->out[no]);
- strm->z.next_out = kout + j;
- }
- }
-
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Or use short-lived pages from the
- * on-stack pagepool where pages share among the same request
- * and not _all_ inplace I/O pages are needed to be doubled.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
- strm->z.next_in = strm->bounce;
- bounced = true;
- }
-
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
- tmppage = erofs_allocpage(pgpl, rq->gfp);
- if (!tmppage) {
- err = -ENOMEM;
- goto failed;
- }
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
+ dctx.avail_out = strm->z.avail_out;
+ dctx.inbuf_sz = strm->z.avail_in;
+ err = z_erofs_stream_switch_bufs(&dctx,
+ (void **)&strm->z.next_out,
+ (void **)&strm->z.next_in, pgpl);
+ if (err)
+ break;
+ strm->z.avail_out = dctx.avail_out;
+ strm->z.avail_in = dctx.inbuf_sz;
zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
- if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr != Z_OK || !(rq->outputsize + strm->z.avail_out)) {
if (zerr == Z_OK && rq->partial_decoding)
break;
- if (zerr == Z_STREAM_END && !outsz)
+ if (zerr == Z_STREAM_END && !rq->outputsize)
break;
erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
zerr, rq->inputsize, rq->outputsize);
@@ -237,13 +169,12 @@ again:
break;
}
}
-failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
- if (kout)
- kunmap_local(kout);
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
failed_zinit:
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
/* 4. push back DEFLATE stream context to the global list */
spin_lock(&z_erofs_deflate_lock);
strm->next = z_erofs_deflate_head;
@@ -252,3 +183,11 @@ failed_zinit:
wake_up(&z_erofs_deflate_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_deflate_decomp = {
+ .config = z_erofs_load_deflate_config,
+ .decompress = z_erofs_deflate_decompress,
+ .init = z_erofs_deflate_init,
+ .exit = z_erofs_deflate_exit,
+ .name = "deflate",
+};
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 4b28dc130c9f..06a722b85a45 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -5,7 +5,6 @@
struct z_erofs_lzma {
struct z_erofs_lzma *next;
struct xz_dec_microlzma *state;
- struct xz_buf buf;
u8 bounce[PAGE_SIZE];
};
@@ -18,7 +17,7 @@ static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
-void z_erofs_lzma_exit(void)
+static void z_erofs_lzma_exit(void)
{
/* there should be no running fs instance */
while (z_erofs_lzma_avail_strms) {
@@ -46,7 +45,7 @@ void z_erofs_lzma_exit(void)
}
}
-int __init z_erofs_lzma_init(void)
+static int __init z_erofs_lzma_init(void)
{
unsigned int i;
@@ -70,7 +69,7 @@ int __init z_erofs_lzma_init(void)
return 0;
}
-int z_erofs_load_lzma_config(struct super_block *sb,
+static int z_erofs_load_lzma_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(lzma_resize_mutex);
@@ -147,26 +146,28 @@ again:
return err;
}
-int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- unsigned int inlen, outlen, pageofs;
+ struct super_block *sb = rq->sb;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
+ struct xz_buf buf = {};
struct z_erofs_lzma *strm;
- u8 *kin;
- bool bounced = false;
- int no, ni, j, err = 0;
+ enum xz_ret xz_err;
+ int err;
/* 1. get the exact LZMA compressed size */
- kin = kmap(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- rq->sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap(*rq->in);
+ kunmap_local(dctx.kin);
return err;
}
@@ -183,108 +184,45 @@ again:
spin_unlock(&z_erofs_lzma_lock);
/* 3. multi-call decompress */
- inlen = rq->inputsize;
- outlen = rq->outputsize;
- xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ xz_dec_microlzma_reset(strm->state, rq->inputsize, rq->outputsize,
!rq->partial_decoding);
- pageofs = rq->pageofs_out;
- strm->buf.in = kin + rq->pageofs_in;
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
- inlen -= strm->buf.in_size;
- strm->buf.out = NULL;
- strm->buf.out_pos = 0;
- strm->buf.out_size = 0;
-
- for (ni = 0, no = -1;;) {
- enum xz_ret xz_err;
-
- if (strm->buf.out_pos == strm->buf.out_size) {
- if (strm->buf.out) {
- kunmap(rq->out[no]);
- strm->buf.out = NULL;
- }
-
- if (++no >= nrpages_out || !outlen) {
- erofs_err(rq->sb, "decompressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.out_pos = 0;
- strm->buf.out_size = min_t(u32, outlen,
- PAGE_SIZE - pageofs);
- outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
- if (!rq->out[no]) {
- err = -ENOMEM;
- break;
- }
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- if (rq->out[no])
- strm->buf.out = kmap(rq->out[no]) + pageofs;
- pageofs = 0;
- } else if (strm->buf.in_pos == strm->buf.in_size) {
- kunmap(rq->in[ni]);
-
- if (++ni >= nrpages_in || !inlen) {
- erofs_err(rq->sb, "compressed buf out of bound");
- err = -EFSCORRUPTED;
- break;
- }
- strm->buf.in_pos = 0;
- strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
- inlen -= strm->buf.in_size;
- kin = kmap(rq->in[ni]);
- strm->buf.in = kin;
- bounced = false;
- }
+ buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= buf.in_size;
+ buf.in = dctx.kin + rq->pageofs_in,
+ dctx.bounce = strm->bounce;
+ do {
+ dctx.avail_out = buf.out_size - buf.out_pos;
+ dctx.inbuf_sz = buf.in_size;
+ dctx.inbuf_pos = buf.in_pos;
+ err = z_erofs_stream_switch_bufs(&dctx, (void **)&buf.out,
+ (void **)&buf.in, pgpl);
+ if (err)
+ break;
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Otherwise, Use short-lived pages
- * from the on-stack pagepool where pages share with the same
- * request.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
- strm->buf.in = strm->bounce;
- bounced = true;
+ if (buf.out_size == buf.out_pos) {
+ buf.out_size = dctx.avail_out;
+ buf.out_pos = 0;
}
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
+ buf.in_size = dctx.inbuf_sz;
+ buf.in_pos = dctx.inbuf_pos;
- if (rq->out[no] != rq->in[j])
- continue;
- tmppage = erofs_allocpage(pgpl, rq->gfp);
- if (!tmppage) {
- err = -ENOMEM;
- goto failed;
- }
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
- xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
- DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
- DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+ xz_err = xz_dec_microlzma_run(strm->state, &buf);
+ DBG_BUGON(buf.out_pos > buf.out_size);
+ DBG_BUGON(buf.in_pos > buf.in_size);
if (xz_err != XZ_OK) {
- if (xz_err == XZ_STREAM_END && !outlen)
+ if (xz_err == XZ_STREAM_END && !rq->outputsize)
break;
- erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
xz_err, rq->inputsize, rq->outputsize);
err = -EFSCORRUPTED;
break;
}
- }
-failed:
- if (no < nrpages_out && strm->buf.out)
- kunmap(rq->out[no]);
- if (ni < nrpages_in)
- kunmap(rq->in[ni]);
+ } while (1);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
+ kunmap_local(dctx.kin);
/* 4. push back LZMA stream context to the global list */
spin_lock(&z_erofs_lzma_lock);
strm->next = z_erofs_lzma_head;
@@ -293,3 +231,11 @@ failed:
wake_up(&z_erofs_lzma_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_lzma_decomp = {
+ .config = z_erofs_load_lzma_config,
+ .decompress = z_erofs_lzma_decompress,
+ .init = z_erofs_lzma_init,
+ .exit = z_erofs_lzma_exit,
+ .name = "lzma"
+};
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index 63a23cac3af4..7e177304967e 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -34,7 +34,7 @@ again:
return strm;
}
-void z_erofs_zstd_exit(void)
+static void z_erofs_zstd_exit(void)
{
while (z_erofs_zstd_avail_strms) {
struct z_erofs_zstd *strm, *n;
@@ -49,7 +49,7 @@ void z_erofs_zstd_exit(void)
}
}
-int __init z_erofs_zstd_init(void)
+static int __init z_erofs_zstd_init(void)
{
/* by default, use # of possible CPUs instead */
if (!z_erofs_zstd_nstrms)
@@ -72,7 +72,7 @@ int __init z_erofs_zstd_init(void)
return 0;
}
-int z_erofs_load_zstd_config(struct super_block *sb,
+static int z_erofs_load_zstd_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
static DEFINE_MUTEX(zstd_resize_mutex);
@@ -135,30 +135,29 @@ int z_erofs_load_zstd_config(struct super_block *sb,
return strm ? -ENOMEM : 0;
}
-int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- zstd_dstream *stream;
struct super_block *sb = rq->sb;
- unsigned int insz, outsz, pofs;
- struct z_erofs_zstd *strm;
+ struct z_erofs_stream_dctx dctx = {
+ .rq = rq,
+ .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
+ .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
+ >> PAGE_SHIFT,
+ .no = -1, .ni = 0,
+ };
zstd_in_buffer in_buf = { NULL, 0, 0 };
zstd_out_buffer out_buf = { NULL, 0, 0 };
- u8 *kin, *kout = NULL;
- bool bounced = false;
- int no = -1, ni = 0, j = 0, zerr, err;
+ struct z_erofs_zstd *strm;
+ zstd_dstream *stream;
+ int zerr, err;
/* 1. get the exact compressed size */
- kin = kmap_local_page(*rq->in);
- err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
- min_t(unsigned int, rq->inputsize,
- sb->s_blocksize - rq->pageofs_in));
+ dctx.kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, dctx.kin + rq->pageofs_in,
+ min(rq->inputsize, sb->s_blocksize - rq->pageofs_in));
if (err) {
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
return err;
}
@@ -166,109 +165,48 @@ int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
strm = z_erofs_isolate_strms(false);
/* 3. multi-call decompress */
- insz = rq->inputsize;
- outsz = rq->outputsize;
stream = zstd_init_dstream(z_erofs_zstd_max_dictsize, strm->wksp, strm->wkspsz);
if (!stream) {
err = -EIO;
goto failed_zinit;
}
- pofs = rq->pageofs_out;
- in_buf.size = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
- insz -= in_buf.size;
- in_buf.src = kin + rq->pageofs_in;
+ rq->fillgaps = true; /* ZSTD doesn't support NULL output buffer */
+ in_buf.size = min_t(u32, rq->inputsize, PAGE_SIZE - rq->pageofs_in);
+ rq->inputsize -= in_buf.size;
+ in_buf.src = dctx.kin + rq->pageofs_in;
+ dctx.bounce = strm->bounce;
+
do {
- if (out_buf.size == out_buf.pos) {
- if (++no >= nrpages_out || !outsz) {
- erofs_err(sb, "insufficient space for decompressed data");
- err = -EFSCORRUPTED;
- break;
- }
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ dctx.inbuf_sz = in_buf.size;
+ dctx.inbuf_pos = in_buf.pos;
+ err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
+ (void **)&in_buf.src, pgpl);
+ if (err)
+ break;
- if (kout)
- kunmap_local(kout);
- out_buf.size = min_t(u32, outsz, PAGE_SIZE - pofs);
- outsz -= out_buf.size;
- if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
- if (!rq->out[no]) {
- kout = NULL;
- err = -ENOMEM;
- break;
- }
- set_page_private(rq->out[no],
- Z_EROFS_SHORTLIVED_PAGE);
- }
- kout = kmap_local_page(rq->out[no]);
- out_buf.dst = kout + pofs;
+ if (out_buf.size == out_buf.pos) {
+ out_buf.size = dctx.avail_out;
out_buf.pos = 0;
- pofs = 0;
- }
-
- if (in_buf.size == in_buf.pos && insz) {
- if (++ni >= nrpages_in) {
- erofs_err(sb, "invalid compressed data");
- err = -EFSCORRUPTED;
- break;
- }
-
- if (kout) /* unlike kmap(), take care of the orders */
- kunmap_local(kout);
- kunmap_local(kin);
- in_buf.size = min_t(u32, insz, PAGE_SIZE);
- insz -= in_buf.size;
- kin = kmap_local_page(rq->in[ni]);
- in_buf.src = kin;
- in_buf.pos = 0;
- bounced = false;
- if (kout) {
- j = (u8 *)out_buf.dst - kout;
- kout = kmap_local_page(rq->out[no]);
- out_buf.dst = kout + j;
- }
}
+ in_buf.size = dctx.inbuf_sz;
+ in_buf.pos = dctx.inbuf_pos;
- /*
- * Handle overlapping: Use bounced buffer if the compressed
- * data is under processing; Or use short-lived pages from the
- * on-stack pagepool where pages share among the same request
- * and not _all_ inplace I/O pages are needed to be doubled.
- */
- if (!bounced && rq->out[no] == rq->in[ni]) {
- memcpy(strm->bounce, in_buf.src, in_buf.size);
- in_buf.src = strm->bounce;
- bounced = true;
- }
-
- for (j = ni + 1; j < nrpages_in; ++j) {
- struct page *tmppage;
-
- if (rq->out[no] != rq->in[j])
- continue;
- tmppage = erofs_allocpage(pgpl, rq->gfp);
- if (!tmppage) {
- err = -ENOMEM;
- goto failed;
- }
- set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
- copy_highpage(tmppage, rq->in[j]);
- rq->in[j] = tmppage;
- }
zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
- if (zstd_is_error(zerr) || (!zerr && outsz)) {
+ if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
rq->inputsize, rq->outputsize,
zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
err = -EFSCORRUPTED;
break;
}
- } while (outsz || out_buf.pos < out_buf.size);
-failed:
- if (kout)
- kunmap_local(kout);
+ } while (rq->outputsize || out_buf.pos < out_buf.size);
+
+ if (dctx.kout)
+ kunmap_local(dctx.kout);
failed_zinit:
- kunmap_local(kin);
+ kunmap_local(dctx.kin);
/* 4. push back ZSTD stream context to the global list */
spin_lock(&z_erofs_zstd_lock);
strm->next = z_erofs_zstd_head;
@@ -277,3 +215,11 @@ failed_zinit:
wake_up(&z_erofs_zstd_wq);
return err;
}
+
+const struct z_erofs_decompressor z_erofs_zstd_decomp = {
+ .config = z_erofs_load_zstd_config,
+ .decompress = z_erofs_zstd_decompress,
+ .init = z_erofs_zstd_init,
+ .exit = z_erofs_zstd_exit,
+ .name = "zstd",
+};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 0c1b44ac9524..736607675396 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -312,17 +312,13 @@ static inline unsigned int erofs_inode_datalayout(unsigned int ifmt)
return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK;
}
-/*
- * Different from grab_cache_page_nowait(), reclaiming is never triggered
- * when allocating new pages.
- */
-static inline
-struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
- pgoff_t index)
+/* reclaiming is never triggered when allocating new folios. */
+static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
+ pgoff_t index)
{
- return pagecache_get_page(mapping, index,
+ return __filemap_get_folio(as, index,
FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
- readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+ readahead_gfp_mask(as) & ~__GFP_RECLAIM);
}
/* Has a disk mapping */
@@ -458,8 +454,8 @@ void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
-int __init z_erofs_init_zip_subsystem(void);
-void z_erofs_exit_zip_subsystem(void);
+int __init z_erofs_init_subsystem(void);
+void z_erofs_exit_subsystem(void);
int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
@@ -476,37 +472,11 @@ static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
-static inline int z_erofs_init_zip_subsystem(void) { return 0; }
-static inline void z_erofs_exit_zip_subsystem(void) {}
-static inline int z_erofs_gbuf_init(void) { return 0; }
-static inline void z_erofs_gbuf_exit(void) {}
+static inline int z_erofs_init_subsystem(void) { return 0; }
+static inline void z_erofs_exit_subsystem(void) {}
static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
-#ifdef CONFIG_EROFS_FS_ZIP_LZMA
-int __init z_erofs_lzma_init(void);
-void z_erofs_lzma_exit(void);
-#else
-static inline int z_erofs_lzma_init(void) { return 0; }
-static inline int z_erofs_lzma_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
-
-#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
-int __init z_erofs_deflate_init(void);
-void z_erofs_deflate_exit(void);
-#else
-static inline int z_erofs_deflate_init(void) { return 0; }
-static inline int z_erofs_deflate_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
-
-#ifdef CONFIG_EROFS_FS_ZIP_ZSTD
-int __init z_erofs_zstd_init(void);
-void z_erofs_zstd_exit(void);
-#else
-static inline int z_erofs_zstd_init(void) { return 0; }
-static inline int z_erofs_zstd_exit(void) { return 0; }
-#endif /* !CONFIG_EROFS_FS_ZIP_ZSTD */
-
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index c93bd24d2771..35268263aaed 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -343,7 +343,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->build_time = le64_to_cpu(dsb->build_time);
sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
- memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid));
+ super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
ret = strscpy(sbi->volume_name, dsb->volume_name,
sizeof(dsb->volume_name));
@@ -849,23 +849,7 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
- err = z_erofs_lzma_init();
- if (err)
- goto lzma_err;
-
- err = z_erofs_deflate_init();
- if (err)
- goto deflate_err;
-
- err = z_erofs_zstd_init();
- if (err)
- goto zstd_err;
-
- err = z_erofs_gbuf_init();
- if (err)
- goto gbuf_err;
-
- err = z_erofs_init_zip_subsystem();
+ err = z_erofs_init_subsystem();
if (err)
goto zip_err;
@@ -882,16 +866,8 @@ static int __init erofs_module_init(void)
fs_err:
erofs_exit_sysfs();
sysfs_err:
- z_erofs_exit_zip_subsystem();
+ z_erofs_exit_subsystem();
zip_err:
- z_erofs_gbuf_exit();
-gbuf_err:
- z_erofs_zstd_exit();
-zstd_err:
- z_erofs_deflate_exit();
-deflate_err:
- z_erofs_lzma_exit();
-lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -906,13 +882,9 @@ static void __exit erofs_module_exit(void)
rcu_barrier();
erofs_exit_sysfs();
- z_erofs_exit_zip_subsystem();
- z_erofs_zstd_exit();
- z_erofs_deflate_exit();
- z_erofs_lzma_exit();
+ z_erofs_exit_subsystem();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
- z_erofs_gbuf_exit();
}
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index d6fe002a4a71..424f656cd765 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -19,10 +19,7 @@
typedef void *z_erofs_next_pcluster_t;
struct z_erofs_bvec {
- union {
- struct page *page;
- struct folio *folio;
- };
+ struct page *page;
int offset;
unsigned int end;
};
@@ -449,44 +446,51 @@ static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif
-void z_erofs_exit_zip_subsystem(void)
+void z_erofs_exit_subsystem(void)
{
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_exit_decompressor();
}
-int __init z_erofs_init_zip_subsystem(void)
+int __init z_erofs_init_subsystem(void)
{
- int err = z_erofs_create_pcluster_pool();
+ int err = z_erofs_init_decompressor();
if (err)
- goto out_error_pcluster_pool;
+ goto err_decompressor;
+
+ err = z_erofs_create_pcluster_pool();
+ if (err)
+ goto err_pcluster_pool;
z_erofs_workqueue = alloc_workqueue("erofs_worker",
WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
if (!z_erofs_workqueue) {
err = -ENOMEM;
- goto out_error_workqueue_init;
+ goto err_workqueue_init;
}
err = erofs_init_percpu_workers();
if (err)
- goto out_error_pcpu_worker;
+ goto err_pcpu_worker;
err = erofs_cpu_hotplug_init();
if (err < 0)
- goto out_error_cpuhp_init;
+ goto err_cpuhp_init;
return err;
-out_error_cpuhp_init:
+err_cpuhp_init:
erofs_destroy_percpu_workers();
-out_error_pcpu_worker:
+err_pcpu_worker:
destroy_workqueue(z_erofs_workqueue);
-out_error_workqueue_init:
+err_workqueue_init:
z_erofs_destroy_pcluster_pool();
-out_error_pcluster_pool:
+err_pcluster_pool:
+ z_erofs_exit_decompressor();
+err_decompressor:
return err;
}
@@ -617,32 +621,31 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
-/* called by erofs_shrinker to get rid of all cached compressed bvecs */
+/* (erofs_shrinker) disconnect cached encoded data with pclusters */
int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct folio *folio;
int i;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- /* There is no actice user since the pcluster is now freezed */
+ /* Each cached folio contains one page unless bs > ps is supported */
for (i = 0; i < pclusterpages; ++i) {
- struct folio *folio = pcl->compressed_bvecs[i].folio;
-
- if (!folio)
- continue;
+ if (pcl->compressed_bvecs[i].page) {
+ folio = page_folio(pcl->compressed_bvecs[i].page);
+ /* Avoid reclaiming or migrating this folio */
+ if (!folio_trylock(folio))
+ return -EBUSY;
- /* Avoid reclaiming or migrating this folio */
- if (!folio_trylock(folio))
- return -EBUSY;
-
- if (!erofs_folio_is_managed(sbi, folio))
- continue;
- pcl->compressed_bvecs[i].folio = NULL;
- folio_detach_private(folio);
- folio_unlock(folio);
+ if (!erofs_folio_is_managed(sbi, folio))
+ continue;
+ pcl->compressed_bvecs[i].page = NULL;
+ folio_detach_private(folio);
+ folio_unlock(folio);
+ }
}
return 0;
}
@@ -650,9 +653,9 @@ int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
struct z_erofs_pcluster *pcl = folio_get_private(folio);
- unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
+ struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
bool ret;
- int i;
if (!folio_test_private(folio))
return true;
@@ -661,9 +664,9 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
spin_lock(&pcl->obj.lockref.lock);
if (pcl->obj.lockref.count <= 0) {
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].folio == folio) {
- pcl->compressed_bvecs[i].folio = NULL;
+ for (; bvec < end; ++bvec) {
+ if (bvec->page && page_folio(bvec->page) == folio) {
+ bvec->page = NULL;
folio_detach_private(folio);
ret = true;
break;
@@ -925,7 +928,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
fe->pcl = NULL;
}
-static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
unsigned int cur, unsigned int end, erofs_off_t pos)
{
struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
@@ -938,113 +941,109 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
- cnt = min_t(unsigned int, end - cur,
- sb->s_blocksize - erofs_blkoff(sb, pos));
+ cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
src = erofs_bread(&buf, pos, EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
- memcpy_to_page(page, cur, src, cnt);
+ memcpy_to_folio(folio, cur, src, cnt);
}
erofs_put_metabuf(&buf);
return 0;
}
-static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *fe,
+static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *f,
struct folio *folio, bool ra)
{
- struct inode *const inode = fe->inode;
- struct erofs_map_blocks *const map = &fe->map;
+ struct inode *const inode = f->inode;
+ struct erofs_map_blocks *const map = &f->map;
const loff_t offset = folio_pos(folio);
- const unsigned int bs = i_blocksize(inode), fs = folio_size(folio);
- bool tight = true, exclusive;
- unsigned int cur, end, len, split;
+ const unsigned int bs = i_blocksize(inode);
+ unsigned int end = folio_size(folio), split = 0, cur, pgs;
+ bool tight, excl;
int err = 0;
+ tight = (bs == PAGE_SIZE);
z_erofs_onlinefolio_init(folio);
- split = 0;
- end = fs;
-repeat:
- if (offset + end - 1 < map->m_la ||
- offset + end - 1 >= map->m_la + map->m_llen) {
- z_erofs_pcluster_end(fe);
- map->m_la = offset + end - 1;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto out;
- }
-
- cur = offset > map->m_la ? 0 : map->m_la - offset;
- /* bump split parts first to avoid several separate cases */
- ++split;
-
- if (!(map->m_flags & EROFS_MAP_MAPPED)) {
- folio_zero_segment(folio, cur, end);
- tight = false;
- goto next_part;
- }
-
- if (map->m_flags & EROFS_MAP_FRAGMENT) {
- erofs_off_t fpos = offset + cur - map->m_la;
+ do {
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(f);
+ map->m_la = offset + end - 1;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ break;
+ }
- len = min_t(unsigned int, map->m_llen - fpos, end - cur);
- err = z_erofs_read_fragment(inode->i_sb, &folio->page, cur,
- cur + len, EROFS_I(inode)->z_fragmentoff + fpos);
- if (err)
- goto out;
- tight = false;
- goto next_part;
- }
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ pgs = round_down(cur, PAGE_SIZE);
+ /* bump split parts first to avoid several separate cases */
+ ++split;
+
+ if (!(map->m_flags & EROFS_MAP_MAPPED)) {
+ folio_zero_segment(folio, cur, end);
+ tight = false;
+ } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ erofs_off_t fpos = offset + cur - map->m_la;
+
+ err = z_erofs_read_fragment(inode->i_sb, folio, cur,
+ cur + min(map->m_llen - fpos, end - cur),
+ EROFS_I(inode)->z_fragmentoff + fpos);
+ if (err)
+ break;
+ tight = false;
+ } else {
+ if (!f->pcl) {
+ err = z_erofs_pcluster_begin(f);
+ if (err)
+ break;
+ f->pcl->besteffort |= !ra;
+ }
- if (!fe->pcl) {
- err = z_erofs_pcluster_begin(fe);
- if (err)
- goto out;
- fe->pcl->besteffort |= !ra;
- }
+ pgs = round_down(end - 1, PAGE_SIZE);
+ /*
+ * Ensure this partial page belongs to this submit chain
+ * rather than other concurrent submit chains or
+ * noio(bypass) chains since those chains are handled
+ * asynchronously thus it cannot be used for inplace I/O
+ * or bvpage (should be processed in the strict order.)
+ */
+ tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
+ excl = false;
+ if (cur <= pgs) {
+ excl = (split <= 1) || tight;
+ cur = pgs;
+ }
- /*
- * Ensure the current partial folio belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the folio cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- exclusive = (!cur && ((split <= 1) || (tight && bs == fs)));
- if (cur)
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
-
- err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
- .page = &folio->page,
- .offset = offset - map->m_la,
- .end = end,
- }), exclusive);
- if (err)
- goto out;
-
- z_erofs_onlinefolio_split(folio);
- if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- fe->pcl->multibases = true;
- if (fe->pcl->length < offset + end - map->m_la) {
- fe->pcl->length = offset + end - map->m_la;
- fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
- }
- if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
- !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
- fe->pcl->length == map->m_llen)
- fe->pcl->partial = false;
-next_part:
- /* shorten the remaining extent to update progress */
- map->m_llen = offset + cur - map->m_la;
- map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
-
- end = cur;
- if (end > 0)
- goto repeat;
+ err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
+ .page = folio_page(folio, pgs >> PAGE_SHIFT),
+ .offset = offset + pgs - map->m_la,
+ .end = end - pgs, }), excl);
+ if (err)
+ break;
-out:
+ z_erofs_onlinefolio_split(folio);
+ if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ f->pcl->multibases = true;
+ if (f->pcl->length < offset + end - map->m_la) {
+ f->pcl->length = offset + end - map->m_la;
+ f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ f->pcl->length == map->m_llen)
+ f->pcl->partial = false;
+ }
+ /* shorten the remaining extent to update progress */
+ map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
+ if (cur <= pgs) {
+ split = cur < pgs;
+ tight = (bs == PAGE_SIZE);
+ }
+ } while ((end = cur) > 0);
z_erofs_onlinefolio_end(folio, err);
return err;
}
@@ -1066,7 +1065,7 @@ static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
static bool z_erofs_page_is_invalidated(struct page *page)
{
- return !page->mapping && !z_erofs_is_shortlived_page(page);
+ return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
}
struct z_erofs_decompress_backend {
@@ -1221,8 +1220,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
const struct z_erofs_decompressor *decomp =
- &erofs_decompressors[pcl->algorithmformat];
- int i, err2;
+ z_erofs_decomp[pcl->algorithmformat];
+ int i, j, jtop, err2;
struct page *page;
bool overlapped;
@@ -1280,10 +1279,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
} else {
+ /* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
- /* consider shortlived pages added when decompressing */
page = be->compressed_pages[i];
-
if (!page ||
erofs_folio_is_managed(sbi, page_folio(page)))
continue;
@@ -1294,21 +1292,31 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
kvfree(be->compressed_pages);
- z_erofs_fill_other_copies(be, err);
+ jtop = 0;
+ z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- /* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(be->pagepool, page))
+ if (!z_erofs_is_shortlived_page(page)) {
+ z_erofs_onlinefolio_end(page_folio(page), err);
continue;
- z_erofs_onlinefolio_end(page_folio(page), err);
+ }
+ if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
+ erofs_pagepool_add(be->pagepool, page);
+ continue;
+ }
+ for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
+ ;
+ if (j >= jtop) /* this bounce page is newly detected */
+ be->decompressed_pages[jtop++] = page;
}
-
+ while (jtop)
+ erofs_pagepool_add(be->pagepool,
+ be->decompressed_pages[--jtop]);
if (be->decompressed_pages != be->onstack_pages)
kvfree(be->decompressed_pages);
@@ -1419,7 +1427,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bool tocache = false;
struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page;
+ struct folio *folio;
int bs = i_blocksize(f->inode);
/* Except for inplace folios, the entire folio can be used for I/Os */
@@ -1429,23 +1437,25 @@ repeat:
spin_lock(&pcl->obj.lockref.lock);
zbv = pcl->compressed_bvecs[nr];
spin_unlock(&pcl->obj.lockref.lock);
- if (!zbv.folio)
+ if (!zbv.page)
goto out_allocfolio;
- bvec->bv_page = &zbv.folio->page;
+ bvec->bv_page = zbv.page;
DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
+
+ folio = page_folio(zbv.page);
/*
* Handle preallocated cached folios. We tried to allocate such folios
* without triggering direct reclaim. If allocation failed, inplace
* file-backed folios will be used instead.
*/
- if (zbv.folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
- zbv.folio->private = 0;
+ if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
+ folio->private = 0;
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(zbv.folio->mapping);
+ mapping = READ_ONCE(folio->mapping);
/*
* File-backed folios for inplace I/Os are all locked steady,
* therefore it is impossible for `mapping` to be NULL.
@@ -1457,21 +1467,21 @@ repeat:
return;
}
- folio_lock(zbv.folio);
- if (zbv.folio->mapping == mc) {
+ folio_lock(folio);
+ if (folio->mapping == mc) {
/*
* The cached folio is still in managed cache but without
* a valid `->private` pcluster hint. Let's reconnect them.
*/
- if (!folio_test_private(zbv.folio)) {
- folio_attach_private(zbv.folio, pcl);
+ if (!folio_test_private(folio)) {
+ folio_attach_private(folio, pcl);
/* compressed_bvecs[] already takes a ref before */
- folio_put(zbv.folio);
+ folio_put(folio);
}
/* no need to submit if it is already up-to-date */
- if (folio_test_uptodate(zbv.folio)) {
- folio_unlock(zbv.folio);
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
bvec->bv_page = NULL;
}
return;
@@ -1481,32 +1491,31 @@ repeat:
* It has been truncated, so it's unsafe to reuse this one. Let's
* allocate a new page for compressed data.
*/
- DBG_BUGON(zbv.folio->mapping);
+ DBG_BUGON(folio->mapping);
tocache = true;
- folio_unlock(zbv.folio);
- folio_put(zbv.folio);
+ folio_unlock(folio);
+ folio_put(folio);
out_allocfolio:
- page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+ zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
spin_lock(&pcl->obj.lockref.lock);
- if (pcl->compressed_bvecs[nr].folio) {
- erofs_pagepool_add(&f->pagepool, page);
+ if (pcl->compressed_bvecs[nr].page) {
+ erofs_pagepool_add(&f->pagepool, zbv.page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
- pcl->compressed_bvecs[nr].folio = zbv.folio = page_folio(page);
+ bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page;
+ folio = page_folio(zbv.page);
+ /* first mark it as a temporary shortlived folio (now 1 ref) */
+ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
spin_unlock(&pcl->obj.lockref.lock);
- bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, zbv.folio, pcl->obj.index + nr, gfp)) {
- /* turn into a temporary shortlived folio (1 ref) */
- zbv.folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+ filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp))
return;
- }
- folio_attach_private(zbv.folio, pcl);
+ folio_attach_private(folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */
- folio_put(zbv.folio);
+ folio_put(folio);
}
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1767,7 +1776,6 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
end = round_up(end, PAGE_SIZE);
} else {
end = round_up(map->m_la, PAGE_SIZE);
-
if (!map->m_llen)
return;
}
@@ -1775,15 +1783,15 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
cur = map->m_la + map->m_llen - 1;
while ((cur >= end) && (cur < i_size_read(inode))) {
pgoff_t index = cur >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
- page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
- if (page) {
- if (PageUptodate(page))
- unlock_page(page);
+ folio = erofs_grab_folio_nowait(inode->i_mapping, index);
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_uptodate(folio))
+ folio_unlock(folio);
else
- z_erofs_scan_folio(f, page_folio(page), !!rac);
- put_page(page);
+ z_erofs_scan_folio(f, folio, !!rac);
+ folio_put(folio);
}
if (cur < PAGE_SIZE)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9b248ee5fef2..403af6e31d5b 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -686,7 +686,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
struct erofs_inode *const vi = EROFS_I(inode);
int err = 0;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+ trace_erofs_map_blocks_enter(inode, map, flags);
/* when trying to read beyond EOF, leave it unmapped */
if (map->m_la >= inode->i_size) {
@@ -711,7 +711,9 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
err = z_erofs_do_map_blocks(inode, map, flags);
out:
- trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
+ if (err)
+ map->m_llen = 0;
+ trace_erofs_map_blocks_exit(inode, map, flags, err);
return err;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 036024bce9f7..b80f612867c2 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -148,7 +148,7 @@ int __init z_erofs_gbuf_init(void)
void z_erofs_gbuf_exit(void)
{
- int i;
+ int i, j;
for (i = 0; i < z_erofs_gbuf_count + (!!z_erofs_rsvbuf); ++i) {
struct z_erofs_gbuf *gbuf = &z_erofs_gbufpool[i];
@@ -161,9 +161,9 @@ void z_erofs_gbuf_exit(void)
if (!gbuf->pages)
continue;
- for (i = 0; i < gbuf->nrpages; ++i)
- if (gbuf->pages[i])
- put_page(gbuf->pages[i]);
+ for (j = 0; j < gbuf->nrpages; ++j)
+ if (gbuf->pages[j])
+ put_page(gbuf->pages[j]);
kfree(gbuf->pages);
gbuf->pages = NULL;
}
diff --git a/fs/exec.c b/fs/exec.c
index 40073142288f..a47d0e4c54f6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -486,6 +486,35 @@ static int count_strings_kernel(const char *const *argv)
return i;
}
+static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
+ unsigned long limit)
+{
+#ifdef CONFIG_MMU
+ /* Avoid a pathological bprm->p. */
+ if (bprm->p < limit)
+ return -E2BIG;
+ bprm->argmin = bprm->p - limit;
+#endif
+ return 0;
+}
+static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
+{
+#ifdef CONFIG_MMU
+ return bprm->p < bprm->argmin;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Calculate bprm->argmin from:
+ * - _STK_LIM
+ * - ARG_MAX
+ * - bprm->rlim_stack.rlim_cur
+ * - bprm->argc
+ * - bprm->envc
+ * - bprm->p
+ */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
unsigned long limit, ptr_size;
@@ -505,6 +534,9 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* of argument strings even with small stacks
*/
limit = max_t(unsigned long, limit, ARG_MAX);
+ /* Reject totally pathological counts. */
+ if (bprm->argc < 0 || bprm->envc < 0)
+ return -E2BIG;
/*
* We must account for the size of all the argv and envp pointers to
* the argv and envp strings, since they will also take up space in
@@ -518,13 +550,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* argc can never be 0, to keep them from walking envp by accident.
* See do_execveat_common().
*/
- ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
+ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) ||
+ check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
+ return -E2BIG;
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
- bprm->argmin = bprm->p - limit;
- return 0;
+ return bprm_set_stack_limit(bprm, limit);
}
/*
@@ -562,10 +595,8 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
pos = bprm->p;
str += len;
bprm->p -= len;
-#ifdef CONFIG_MMU
- if (bprm->p < bprm->argmin)
+ if (bprm_hit_stack_limit(bprm))
goto out;
-#endif
while (len > 0) {
int offset, bytes_to_copy;
@@ -640,7 +671,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
/* We're going to work our way backwards. */
arg += len;
bprm->p -= len;
- if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
+ if (bprm_hit_stack_limit(bprm))
return -E2BIG;
while (len > 0) {
@@ -952,10 +983,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
path_noexec(&file->f_path)))
goto exit;
- err = deny_write_access(file);
- if (err)
- goto exit;
-
out:
return file;
@@ -971,8 +998,7 @@ exit:
*
* Returns ERR_PTR on failure or allocated struct file on success.
*
- * As this is a wrapper for the internal do_open_execat(), callers
- * must call allow_write_access() before fput() on release. Also see
+ * As this is a wrapper for the internal do_open_execat(). Also see
* do_close_execat().
*/
struct file *open_exec(const char *name)
@@ -1524,10 +1550,8 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
/* Matches do_open_execat() */
static void do_close_execat(struct file *file)
{
- if (!file)
- return;
- allow_write_access(file);
- fput(file);
+ if (file)
+ fput(file);
}
static void free_bprm(struct linux_binprm *bprm)
@@ -1846,7 +1870,6 @@ static int exec_binprm(struct linux_binprm *bprm)
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
- allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);
@@ -2211,3 +2234,7 @@ static int __init init_fs_exec_sysctls(void)
fs_initcall(init_fs_exec_sysctls);
#endif /* CONFIG_SYSCTL */
+
+#ifdef CONFIG_EXEC_KUNIT_TEST
+#include "exec_test.c"
+#endif
diff --git a/fs/exec_test.c b/fs/exec_test.c
new file mode 100644
index 000000000000..7c77d039680b
--- /dev/null
+++ b/fs/exec_test.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+struct bprm_stack_limits_result {
+ struct linux_binprm bprm;
+ int expected_rc;
+ unsigned long expected_argmin;
+};
+
+static const struct bprm_stack_limits_result bprm_stack_limits_results[] = {
+ /* Negative argc/envc counts produce -E2BIG */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = INT_MIN, .envc = INT_MIN }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 5, .envc = -1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = -1, .envc = 10 }, .expected_rc = -E2BIG },
+ /* The max value of argc or envc is MAX_ARG_STRINGS. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = INT_MAX, .envc = INT_MAX }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = MAX_ARG_STRINGS, .envc = MAX_ARG_STRINGS }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 0, .envc = MAX_ARG_STRINGS }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = MAX_ARG_STRINGS, .envc = 0 }, .expected_rc = -E2BIG },
+ /*
+ * On 32-bit system these argc and envc counts, while likely impossible
+ * to represent within the associated TASK_SIZE, could overflow the
+ * limit calculation, and bypass the ptr_size <= limit check.
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 0x20000001, .envc = 0x20000001 }, .expected_rc = -E2BIG },
+#ifdef CONFIG_MMU
+ /* Make sure a pathological bprm->p doesn't cause an overflow. */
+ { { .p = sizeof(void *), .rlim_stack.rlim_cur = ULONG_MAX,
+ .argc = 10, .envc = 10 }, .expected_rc = -E2BIG },
+#endif
+ /*
+ * 0 rlim_stack will get raised to ARG_MAX. With 1 string pointer,
+ * we should see p - ARG_MAX + sizeof(void *).
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 1, .envc = 0 }, .expected_argmin = ULONG_MAX - ARG_MAX + sizeof(void *)},
+ /* Validate that argc is always raised to a minimum of 1. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = 0 }, .expected_argmin = ULONG_MAX - ARG_MAX + sizeof(void *)},
+ /*
+ * 0 rlim_stack will get raised to ARG_MAX. With pointers filling ARG_MAX,
+ * we should see -E2BIG. (Note argc is always raised to at least 1.)
+ */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = ARG_MAX / sizeof(void *) + 1, .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) }, .expected_rc = -E2BIG },
+ /* And with one less, we see space for exactly 1 pointer. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = (ARG_MAX / sizeof(void *)) - 1, .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 0,
+ .argc = 0, .envc = (ARG_MAX / sizeof(void *)) - 2, },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* If we raise rlim_stack / 4 to exactly ARG_MAX, nothing changes. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = ARG_MAX / sizeof(void *) + 1, .envc = 0 }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) }, .expected_rc = -E2BIG },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = (ARG_MAX / sizeof(void *)) - 1, .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = ARG_MAX * 4,
+ .argc = 0, .envc = (ARG_MAX / sizeof(void *)) - 2, },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* But raising it another pointer * 4 will provide space for 1 more pointer. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = (ARG_MAX + sizeof(void *)) * 4,
+ .argc = ARG_MAX / sizeof(void *), .envc = 0 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = (ARG_MAX + sizeof(void *)) * 4,
+ .argc = 0, .envc = ARG_MAX / sizeof(void *) - 1 },
+ .expected_argmin = ULONG_MAX - sizeof(void *) },
+ /* Raising rlim_stack / 4 to _STK_LIM / 4 * 3 will see more space. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ /* But raising it any further will see no increase. */
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * 3 + sizeof(void *)),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * (_STK_LIM / 4 * + sizeof(void *)),
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * _STK_LIM,
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+ { { .p = ULONG_MAX, .rlim_stack.rlim_cur = 4 * _STK_LIM,
+ .argc = 0, .envc = 0 },
+ .expected_argmin = ULONG_MAX - (_STK_LIM / 4 * 3) + sizeof(void *) },
+};
+
+static void exec_test_bprm_stack_limits(struct kunit *test)
+{
+ /* Double-check the constants. */
+ KUNIT_EXPECT_EQ(test, _STK_LIM, SZ_8M);
+ KUNIT_EXPECT_EQ(test, ARG_MAX, 32 * SZ_4K);
+ KUNIT_EXPECT_EQ(test, MAX_ARG_STRINGS, 0x7FFFFFFF);
+
+ for (int i = 0; i < ARRAY_SIZE(bprm_stack_limits_results); i++) {
+ const struct bprm_stack_limits_result *result = &bprm_stack_limits_results[i];
+ struct linux_binprm bprm = result->bprm;
+ int rc;
+
+ rc = bprm_stack_limits(&bprm);
+ KUNIT_EXPECT_EQ_MSG(test, rc, result->expected_rc, "on loop %d", i);
+#ifdef CONFIG_MMU
+ KUNIT_EXPECT_EQ_MSG(test, bprm.argmin, result->expected_argmin, "on loop %d", i);
+#endif
+ }
+}
+
+static struct kunit_case exec_test_cases[] = {
+ KUNIT_CASE(exec_test_bprm_stack_limits),
+ {},
+};
+
+static struct kunit_suite exec_test_suite = {
+ .name = "exec",
+ .test_cases = exec_test_cases,
+};
+
+kunit_test_suite(exec_test_suite);
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 84572e11cc05..7446bf09a04a 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -813,7 +813,7 @@ static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es,
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
if (num_bh > ARRAY_SIZE(es->__bh)) {
- es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL);
+ es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_NOFS);
if (!es->bh) {
brelse(bh);
return -ENOMEM;
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 9adfc38ca7da..64c31867bc76 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -89,12 +89,14 @@ free_clu:
return -EIO;
}
-static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
+static bool exfat_allow_set_time(struct mnt_idmap *idmap,
+ struct exfat_sb_info *sbi, struct inode *inode)
{
mode_t allow_utime = sbi->options.allow_utime;
- if (!uid_eq(current_fsuid(), inode->i_uid)) {
- if (in_group_p(inode->i_gid))
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
+ current_fsuid())) {
+ if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
allow_utime >>= 3;
if (allow_utime & MAY_WRITE)
return true;
@@ -283,7 +285,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_backing_inode(path->dentry);
struct exfat_inode_info *ei = EXFAT_I(inode);
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
exfat_truncate_atime(&stat->atime);
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
@@ -311,20 +313,22 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) &&
- exfat_allow_set_time(sbi, inode)) {
+ exfat_allow_set_time(idmap, sbi, inode)) {
attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET |
ATTR_TIMES_SET);
}
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+ error = setattr_prepare(idmap, dentry, attr);
attr->ia_valid = ia_valid;
if (error)
goto out;
if (((attr->ia_valid & ATTR_UID) &&
- !uid_eq(attr->ia_uid, sbi->options.fs_uid)) ||
+ (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid),
+ sbi->options.fs_uid))) ||
((attr->ia_valid & ATTR_GID) &&
- !gid_eq(attr->ia_gid, sbi->options.fs_gid)) ||
+ (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid),
+ sbi->options.fs_gid))) ||
((attr->ia_valid & ATTR_MODE) &&
(attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | 0777)))) {
error = -EPERM;
@@ -343,7 +347,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (attr->ia_valid & ATTR_SIZE)
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- setattr_copy(&nop_mnt_idmap, inode, attr);
+ setattr_copy(idmap, inode, attr);
exfat_truncate_inode_atime(inode);
if (attr->ia_valid & ATTR_SIZE) {
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 3d5ea2cfad66..323ecebe6f0e 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -225,8 +225,8 @@ static const struct constant_table exfat_param_enums[] = {
};
static const struct fs_parameter_spec exfat_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
fsparam_u32oct("umask", Opt_umask),
fsparam_u32oct("dmask", Opt_dmask),
fsparam_u32oct("fmask", Opt_fmask),
@@ -262,10 +262,10 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (opt) {
case Opt_uid:
- opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ opts->fs_uid = result.uid;
break;
case Opt_gid:
- opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ opts->fs_gid = result.gid;
break;
case Opt_umask:
opts->fs_fmask = result.uint_32;
@@ -788,7 +788,7 @@ static struct file_system_type exfat_fs_type = {
.init_fs_context = exfat_init_fs_context,
.parameters = exfat_parameters,
.kill_sb = exfat_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
static void exfat_inode_init_once(void *foo)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 07ea3d62b298..4f2dd4ab4486 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -427,7 +427,7 @@ EXPORT_SYMBOL_GPL(exportfs_encode_fh);
struct dentry *
exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
- int fileid_type,
+ int fileid_type, unsigned int flags,
int (*acceptable)(void *, struct dentry *),
void *context)
{
@@ -445,6 +445,11 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
if (IS_ERR_OR_NULL(result))
return result;
+ if ((flags & EXPORT_FH_DIR_ONLY) && !d_is_dir(result)) {
+ err = -ENOTDIR;
+ goto err_result;
+ }
+
/*
* If no acceptance criteria was specified by caller, a disconnected
* dentry is also accepatable. Callers may use this mode to query if
@@ -581,7 +586,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
{
struct dentry *ret;
- ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
+ ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type, 0,
acceptable, context);
if (IS_ERR_OR_NULL(ret)) {
if (ret == ERR_PTR(-ENOMEM))
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1bfd6ab11038..b8cfab8f98b9 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -77,26 +77,33 @@ static int ext2_valid_block_bitmap(struct super_block *sb,
ext2_grpblk_t next_zero_bit;
ext2_fsblk_t bitmap_blk;
ext2_fsblk_t group_first_block;
+ ext2_grpblk_t max_bit;
group_first_block = ext2_group_first_block_no(sb, block_group);
+ max_bit = ext2_group_last_block_no(sb, block_group) - group_first_block;
/* check whether block bitmap block number is set */
bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
offset = bitmap_blk - group_first_block;
- if (!ext2_test_bit(offset, bh->b_data))
+ if (offset < 0 || offset > max_bit ||
+ !ext2_test_bit(offset, bh->b_data))
/* bad block bitmap */
goto err_out;
/* check whether the inode bitmap block number is set */
bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
offset = bitmap_blk - group_first_block;
- if (!ext2_test_bit(offset, bh->b_data))
+ if (offset < 0 || offset > max_bit ||
+ !ext2_test_bit(offset, bh->b_data))
/* bad block bitmap */
goto err_out;
/* check whether the inode table block number is set */
bitmap_blk = le32_to_cpu(desc->bg_inode_table);
offset = bitmap_blk - group_first_block;
+ if (offset < 0 || offset > max_bit ||
+ offset + EXT2_SB(sb)->s_itb_per_group - 1 > max_bit)
+ goto err_out;
next_zero_bit = ext2_find_next_zero_bit(bh->b_data,
offset + EXT2_SB(sb)->s_itb_per_group,
offset);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 6fe3c941b565..87ee3a17bd29 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
{
struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node = NULL;
+ struct rb_node *parent = NULL, *new_node;
while (*n) {
parent = *n;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 7ae0b61258a7..0a056d97e640 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -31,11 +31,10 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
+
return err;
}
@@ -51,11 +50,9 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
ext4_fname_from_fscrypt_name(fname, &name);
-#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
if (err)
ext4_fname_free_filename(fname);
-#endif
return err;
}
@@ -70,10 +67,7 @@ void ext4_fname_free_filename(struct ext4_filename *fname)
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static bool uuid_is_zero(__u8 u[16])
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 983dad8c07ec..08acd152261e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1347,7 +1347,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
-/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
+/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
@@ -2511,7 +2511,7 @@ struct ext4_filename {
struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
- struct fscrypt_str cf_name;
+ struct qstr cf_name;
#endif
};
@@ -2745,8 +2745,25 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
- const struct qstr *iname,
- struct ext4_filename *fname);
+ const struct qstr *iname,
+ struct ext4_filename *fname);
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+}
+#else
+static inline int ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct ext4_filename *fname)
+{
+ return 0;
+}
+
+static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
+{
+}
#endif
/* ext4 encryption related stuff goes here crypto.c */
@@ -2769,16 +2786,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
int lookup,
struct ext4_filename *fname)
{
- int err = 0;
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#if IS_ENABLED(CONFIG_UNICODE)
- err = ext4_fname_setup_ci_filename(dir, iname, fname);
-#endif
-
- return err;
+ return ext4_fname_setup_ci_filename(dir, iname, fname);
}
static inline int ext4_fname_prepare_lookup(struct inode *dir,
@@ -2790,10 +2802,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#if IS_ENABLED(CONFIG_UNICODE)
- kfree(fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ ext4_fname_free_ci_filename(fname);
}
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 4a00e2f019d9..17dcf13adde2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -2052,34 +2054,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the start/end block. Note that
+ * end_allocated should always be set to false
+ * if the start and the end block are in the
+ * same cluster
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
- struct pending_reservation *pr = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
+
+ WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+ end_allocated);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2088,11 +2105,15 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
- if ((err1 || err2 || err3) && allocated && !pr)
- pr = __alloc_pending(true);
+ if (err1 || err2 || err3) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
@@ -2112,13 +2133,22 @@ retry:
es2 = NULL;
}
- if (allocated) {
- err3 = __insert_pending(inode, lblk, &pr);
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
if (err3 != 0)
goto error;
- if (pr) {
- __free_pending(pr);
- pr = NULL;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 != 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
}
}
error:
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index d9847a4a25db..3c8e2edee5d5 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 87c009e0c59a..3926a05eceee 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -353,7 +353,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
read_unlock(&sbi->s_journal->j_state_lock);
}
spin_lock(&sbi->s_fc_lock);
- if (sbi->s_fc_ineligible_tid < tid)
+ if (tid_gt(tid, sbi->s_fc_ineligible_tid))
sbi->s_fc_ineligible_tid = tid;
spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
@@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ if (ext4_has_inline_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
+ handle);
+ return;
+ }
+
args.start = start;
args.end = end;
@@ -1207,7 +1213,7 @@ restart_fc:
if (ret == -EALREADY) {
/* There was an ongoing commit, check if we need to restart */
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
- commit_tid > journal->j_commit_sequence)
+ tid_gt(commit_tid, journal->j_commit_sequence))
goto restart_fc;
ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
commit_tid);
@@ -1282,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (iter->i_sync_tid <= tid)
+ if (tid_geq(tid, iter->i_sync_tid))
ext4_fc_reset_inode(&iter->vfs_inode);
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
@@ -1313,7 +1319,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_MAIN]);
- if (tid >= sbi->s_fc_ineligible_tid) {
+ if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
sbi->s_fc_ineligible_tid = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e9bbb1da2d0a..9dfd768ed9f8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1336,10 +1336,7 @@ got:
}
}
- if (ext4_handle_valid(handle)) {
- ei->i_sync_tid = handle->h_transaction->t_tid;
- ei->i_datasync_tid = handle->h_transaction->t_tid;
- }
+ ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d5bd1e3a5d36..e7a09a99837b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
hinfo->hash = EXT4_DIRENT_HASH(de);
hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
} else {
- ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
+ if (err) {
+ ret = err;
+ goto out;
+ }
}
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index f0c0fd507fbc..749af7ad4e09 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
kunit_test_suites(&ext4_inode_test_suite);
+MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4bae9ccf5fe0..941c1c0d5c6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ unsigned int status;
+ int retval;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ else
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
+
+ if (retval <= 0)
+ return retval;
+
+ if (unlikely(retval != map->m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map->m_len);
+ WARN_ON(1);
+ }
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ return retval;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -1450,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve space for a single cluster
+ * Reserve space for 'nr_resv' clusters
*/
-static int ext4_da_reserve_space(struct inode *inode)
+static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1463,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode)
* us from metadata over-estimation, though we may go over by
* a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
if (ret)
return ret;
spin_lock(&ei->i_block_reservation_lock);
- if (ext4_claim_free_clusters(sbi, 1, 0)) {
+ if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
- dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
return -ENOSPC;
}
- ei->i_reserved_data_blocks++;
- trace_ext4_da_reserve_space(inode);
+ ei->i_reserved_data_blocks += nr_resv;
+ trace_ext4_da_reserve_space(inode, nr_resv);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1621,24 +1650,58 @@ static void ext4_print_free_blocks(struct inode *inode)
}
/*
- * ext4_insert_delayed_block - adds a delayed block to the extents status
- * tree, incrementing the reserved cluster/block
- * count or making a pending reservation
- * where needed
+ * Check whether the cluster containing lblk has been allocated or has
+ * delalloc reservation.
+ *
+ * Returns 0 if the cluster doesn't have either, 1 if it has delalloc
+ * reservation, 2 if it's already been allocated, negative error code on
+ * failure.
+ */
+static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+
+ /* Has delalloc reservation? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
+ return 1;
+
+ /* Already been allocated? */
+ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
+ return 2;
+ ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ return ret;
+ if (ret > 0)
+ return 2;
+
+ return 0;
+}
+
+/*
+ * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
+ * status tree, incrementing the reserved
+ * cluster/block count or making pending
+ * reservations where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
*
* Returns 0 on success, negative error code on failure.
*/
-static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
- bool allocated = false;
+ bool lclu_allocated = false;
+ bool end_allocated = false;
+ ext4_lblk_t resv_clu;
+ ext4_lblk_t end = lblk + len - 1;
/*
- * If the cluster containing lblk is shared with a delayed,
+ * If the cluster containing lblk or end is shared with a delayed,
* written, or unwritten extent in a bigalloc file system, it's
* already been accounted for and does not need to be reserved.
* A pending reservation must be made for the cluster if it's
@@ -1649,81 +1712,84 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
* extents status tree doesn't get a match.
*/
if (sbi->s_cluster_ratio == 1) {
- ret = ext4_da_reserve_space(inode);
+ ret = ext4_da_reserve_space(inode, len);
if (ret != 0) /* ENOSPC */
return ret;
} else { /* bigalloc */
- if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
- if (!ext4_es_scan_clu(inode,
- &ext4_es_is_mapped, lblk)) {
- ret = ext4_clu_mapped(inode,
- EXT4_B2C(sbi, lblk));
- if (ret < 0)
- return ret;
- if (ret == 0) {
- ret = ext4_da_reserve_space(inode);
- if (ret != 0) /* ENOSPC */
- return ret;
- } else {
- allocated = true;
- }
- } else {
- allocated = true;
+ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
+
+ ret = ext4_clu_alloc_state(inode, lblk);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ lclu_allocated = (ret == 2);
+ }
+
+ if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
+ ret = ext4_clu_alloc_state(inode, end);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ resv_clu--;
+ end_allocated = (ret == 2);
}
}
+
+ if (resv_clu) {
+ ret = ext4_da_reserve_space(inode, resv_clu);
+ if (ret != 0) /* ENOSPC */
+ return ret;
+ }
}
- ext4_es_insert_delayed_block(inode, lblk, allocated);
+ ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
+ end_allocated);
return 0;
}
/*
- * This function is grabs code from the very beginning of
- * ext4_map_blocks, but assumes that the caller is from delayed write
- * time. This function looks up the requested blocks and sets the
- * buffer delay bit under the protection of i_data_sem.
+ * Looks up the requested blocks and sets the delalloc extent map.
+ * First try to look up for the extent entry that contains the requested
+ * blocks in the extent status tree without i_data_sem, then try to look
+ * up for the ondisk extent mapping with i_data_sem in read mode,
+ * finally hold i_data_sem in write mode, looks up again and add a
+ * delalloc extent entry if it still couldn't find any extent. Pass out
+ * the mapped extent through @map and return 0 on success.
*/
-static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
- struct ext4_map_blocks *map,
- struct buffer_head *bh)
+static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
struct extent_status es;
int retval;
- sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
memcpy(&orig_map, map, sizeof(*map));
#endif
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;
-
map->m_flags = 0;
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
if (ext4_es_is_hole(&es))
goto add_delayed;
+found:
/*
* Delayed extent could be allocated by fallocate.
* So we need to check it.
*/
- if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
+ if (ext4_es_is_delonly(&es)) {
+ map->m_flags |= EXT4_MAP_DELAYED;
return 0;
}
- map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
- retval = es.es_len - (iblock - es.es_lblk);
- if (retval > map->m_len)
- retval = map->m_len;
- map->m_len = retval;
+ map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
if (ext4_es_is_written(&es))
map->m_flags |= EXT4_MAP_MAPPED;
else if (ext4_es_is_unwritten(&es))
@@ -1734,7 +1800,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
- return retval;
+ return 0;
}
/*
@@ -1744,44 +1810,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode))
retval = 0;
- else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
- retval = ext4_ind_map_blocks(NULL, inode, map, 0);
- if (retval < 0) {
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
- if (retval > 0) {
- unsigned int status;
-
- if (unlikely(retval != map->m_len)) {
- ext4_warning(inode->i_sb,
- "ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
- inode->i_ino, retval, map->m_len);
- WARN_ON(1);
- }
-
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- up_read(&EXT4_I(inode)->i_data_sem);
- return retval;
- }
+ retval = ext4_map_query_blocks(NULL, inode, map);
up_read(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval < 0 ? retval : 0;
add_delayed:
down_write(&EXT4_I(inode)->i_data_sem);
- retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ /*
+ * Page fault path (ext4_page_mkwrite does not take i_rwsem)
+ * and fallocate path (no folio lock) can race. Make sure we
+ * lookup the extent status tree here again while i_data_sem
+ * is held in write mode, before inserting a new da entry in
+ * the extent status tree.
+ */
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ map->m_len = min_t(unsigned int, map->m_len,
+ es.es_len - (map->m_lblk - es.es_lblk));
+
+ if (!ext4_es_is_hole(&es)) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto found;
+ }
+ } else if (!ext4_has_inline_data(inode)) {
+ retval = ext4_map_query_blocks(NULL, inode, map);
+ if (retval) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return retval < 0 ? retval : 0;
+ }
+ }
+
+ map->m_flags |= EXT4_MAP_DELAYED;
+ retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
up_write(&EXT4_I(inode)->i_data_sem);
- if (retval)
- return retval;
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
return retval;
}
@@ -1801,11 +1864,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
+ sector_t invalid_block = ~((sector_t) 0xffff);
int ret = 0;
BUG_ON(create == 0);
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+ if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+ invalid_block = ~0;
+
map.m_lblk = iblock;
map.m_len = 1;
@@ -1814,10 +1881,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_da_map_blocks(inode, iblock, &map, bh);
- if (ret <= 0)
+ ret = ext4_da_map_blocks(inode, &map);
+ if (ret < 0)
return ret;
+ if (map.m_flags & EXT4_MAP_DELAYED) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
+ }
+
map_bh(bh, inode->i_sb, map.m_pblk);
ext4_update_bh_state(bh, map.m_flags);
@@ -2945,6 +3019,11 @@ static int ext4_da_do_write_end(struct address_space *mapping,
bool disksize_changed = false;
loff_t new_i_size;
+ if (unlikely(!folio_buffers(folio))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return -EIO;
+ }
/*
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dab7acd49709..e8bf5972dd47 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1151,7 +1151,7 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
lock_buffer(sbi->s_sbh);
- strscpy_pad(label, sbi->s_es->s_volume_name);
+ memtostr_pad(label, sbi->s_es->s_volume_name);
unlock_buffer(sbi->s_sbh);
if (copy_to_user(user_label, label, sizeof(label)))
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a630b27a4cc6..6a95713f9193 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
- if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
+ /* The first directory block must not be a hole. */
+ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
ext4_error_inode(inode, func, line, block,
- "Directory hole found for htree %s block",
- (type == INDEX) ? "index" : "leaf");
+ "Directory hole found for htree %s block %u",
+ (type == INDEX) ? "index" : "leaf", block);
return ERR_PTR(-EFSCORRUPTED);
}
if (!bh)
@@ -1390,62 +1391,11 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for. If quick is set, assume the name being looked up
- * is already in the casefolded form.
- *
- * Returns: 0 if the directory entry matches, more than 0 if it
- * doesn't match or less than zero on error.
- */
-static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- u8 *de_name, size_t de_name_len, bool quick)
-{
- const struct super_block *sb = parent->i_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
- struct qstr entry = QSTR_INIT(de_name, de_name_len);
- int ret;
-
- if (IS_ENCRYPTED(parent)) {
- const struct fscrypt_str encrypted_name =
- FSTR_INIT(de_name, de_name_len);
-
- decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
- if (!decrypted_name.name)
- return -ENOMEM;
- ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
- &decrypted_name);
- if (ret < 0)
- goto out;
- entry.name = decrypted_name.name;
- entry.len = decrypted_name.len;
- }
-
- if (quick)
- ret = utf8_strncasecmp_folded(um, name, &entry);
- else
- ret = utf8_strncasecmp(um, name, &entry);
- if (ret < 0) {
- /* Handle invalid character sequence as either an error
- * or as an opaque byte sequence.
- */
- if (sb_has_strict_encoding(sb))
- ret = -EINVAL;
- else if (name->len != entry.len)
- ret = 1;
- else
- ret = !!memcmp(name->name, entry.name, entry.len);
- }
-out:
- kfree(decrypted_name.name);
- return ret;
-}
-
int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
struct ext4_filename *name)
{
- struct fscrypt_str *cf_name = &name->cf_name;
+ struct qstr *cf_name = &name->cf_name;
+ unsigned char *buf;
struct dx_hash_info *hinfo = &name->hinfo;
int len;
@@ -1455,18 +1405,18 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
return 0;
}
- cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
- if (!cf_name->name)
+ buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!buf)
return -ENOMEM;
- len = utf8_casefold(dir->i_sb->s_encoding,
- iname, cf_name->name,
- EXT4_NAME_LEN);
+ len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN);
if (len <= 0) {
- kfree(cf_name->name);
- cf_name->name = NULL;
+ kfree(buf);
+ buf = NULL;
}
+ cf_name->name = buf;
cf_name->len = (unsigned) len;
+
if (!IS_ENCRYPTED(dir))
return 0;
@@ -1502,22 +1452,29 @@ static bool ext4_match(struct inode *parent,
#if IS_ENABLED(CONFIG_UNICODE)
if (IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
- if (fname->cf_name.name) {
- struct qstr cf = {.name = fname->cf_name.name,
- .len = fname->cf_name.len};
- if (IS_ENCRYPTED(parent)) {
- if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
- fname->hinfo.minor_hash !=
- EXT4_DIRENT_MINOR_HASH(de)) {
-
- return false;
- }
- }
- return !ext4_ci_compare(parent, &cf, de->name,
- de->name_len, true);
- }
- return !ext4_ci_compare(parent, fname->usr_fname, de->name,
- de->name_len, false);
+ /*
+ * Just checking IS_ENCRYPTED(parent) below is not
+ * sufficient to decide whether one can use the hash for
+ * skipping the string comparison, because the key might
+ * have been added right after
+ * ext4_fname_setup_ci_filename(). In this case, a hash
+ * mismatch will be a false negative. Therefore, make
+ * sure cf_name was properly initialized before
+ * considering the calculated hash.
+ */
+ if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
+ fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
+ return false;
+ /*
+ * Treat comparison errors as not a match. The
+ * only case where it happens is on a disk
+ * corruption or ENOMEM.
+ */
+
+ return generic_ci_match(parent, fname->usr_fname,
+ &fname->cf_name, de->name,
+ de->name_len) > 0;
}
#endif
@@ -1869,8 +1826,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#if IS_ENABLED(CONFIG_UNICODE)
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -1878,7 +1834,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
*/
return NULL;
}
-#endif
+
return d_splice_alias(inode, dentry);
}
@@ -2217,6 +2173,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
+static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+{
+ struct fake_dirent *fde;
+ const char *error_msg;
+ unsigned int rlen;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ char *blockend = (char *)root + dir->i_sb->s_blocksize;
+
+ fde = &root->dot;
+ if (unlikely(fde->name_len != 1)) {
+ error_msg = "invalid name_len for '.'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+ error_msg = "invalid name for '.'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '.'";
+ goto corrupted;
+ }
+
+ fde = &root->dotdot;
+ if (unlikely(fde->name_len != 2)) {
+ error_msg = "invalid name_len for '..'";
+ goto corrupted;
+ }
+ if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+ error_msg = "invalid name for '..'";
+ goto corrupted;
+ }
+ rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
+ if (unlikely((char *)fde + rlen >= blockend)) {
+ error_msg = "invalid rec_len for '..'";
+ goto corrupted;
+ }
+
+ return true;
+
+corrupted:
+ EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
+ error_msg);
+ return false;
+}
+
/*
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
@@ -2251,17 +2253,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
brelse(bh);
return retval;
}
+
root = (struct dx_root *) bh->b_data;
+ if (!ext4_check_dx_root(dir, root)) {
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
- if ((char *) de >= (((char *) root) + blocksize)) {
- EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
- brelse(bh);
- return -EFSCORRUPTED;
- }
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
@@ -3083,10 +3085,7 @@ bool ext4_empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode, "invalid size");
return false;
}
- /* The first directory block must not be a hole,
- * so treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh))
return false;
@@ -3208,16 +3207,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
end_rmdir:
brelse(bh);
@@ -3319,16 +3316,15 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
goto out_trace;
retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
-#if IS_ENABLED(CONFIG_UNICODE)
+
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
out_trace:
trace_ext4_unlink_exit(dentry, retval);
@@ -3531,10 +3527,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct ext4_dir_entry_2 *de;
unsigned int offset;
- /* The first directory block must not be a hole, so
- * treat it as DIRENT_HTREE
- */
- bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
+ bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c682fb927b64..e72145c4ae5a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1327,6 +1327,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
+
+ WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter));
ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
for (int i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -1457,7 +1460,8 @@ static void ext4_destroy_inode(struct inode *inode)
dump_stack();
}
- if (EXT4_I(inode)->i_reserved_data_blocks)
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
+ WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
inode->i_ino, EXT4_I(inode),
@@ -1721,8 +1725,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_flag ("bsdgroups", Opt_grpid),
fsparam_flag ("nogrpid", Opt_nogrpid),
fsparam_flag ("sysvgroups", Opt_nogrpid),
- fsparam_u32 ("resgid", Opt_resgid),
- fsparam_u32 ("resuid", Opt_resuid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
fsparam_u32 ("sb", Opt_sb),
fsparam_enum ("errors", Opt_errors, ext4_param_errors),
fsparam_flag ("nouid32", Opt_nouid32),
@@ -2127,8 +2131,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
struct fs_parse_result result;
const struct mount_opts *m;
int is_remount;
- kuid_t uid;
- kgid_t gid;
int token;
token = fs_parse(fc, ext4_param_specs, param, &result);
@@ -2270,23 +2272,11 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_s_stripe;
return 0;
case Opt_resuid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resuid = uid;
+ ctx->s_resuid = result.uid;
ctx->spec |= EXT4_SPEC_s_resuid;
return 0;
case Opt_resgid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid)) {
- ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
- result.uint_32);
- return -EINVAL;
- }
- ctx->s_resgid = gid;
+ ctx->s_resgid = result.gid;
ctx->spec |= EXT4_SPEC_s_resgid;
return 0;
case Opt_journal_dev:
@@ -3586,14 +3576,12 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (ext4_has_feature_casefold(sb)) {
+ if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
"mounted without CONFIG_UNICODE");
return 0;
}
-#endif
if (readonly)
return 1;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6460879b9fcb..46ce2f21fef9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1433,6 +1433,12 @@ retry:
goto out;
memcpy(bh->b_data, buf, csize);
+ /*
+ * Zero out block tail to avoid writing uninitialized memory
+ * to disk.
+ */
+ if (csize < blocksize)
+ memset(bh->b_data + csize, 0, blocksize - csize);
set_buffer_uptodate(bh);
ext4_handle_dirty_metadata(handle, ea_inode, bh);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index ec2aeccb69a3..8bffdeccdbc3 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -219,8 +219,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap,
return error;
if (error == 0)
*acl = NULL;
- if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) &&
- !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+ if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 02c9355176d3..cbd7a5e96a37 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -42,35 +42,49 @@ static unsigned int bucket_blocks(unsigned int level)
return 4;
}
+#if IS_ENABLED(CONFIG_UNICODE)
/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
-#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
+ unsigned char *buf;
+ int len;
if (IS_CASEFOLDED(dir) &&
!is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) {
- fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
- GFP_NOFS, false, F2FS_SB(sb));
- if (!fname->cf_name.name)
+ buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab,
+ GFP_NOFS, false, F2FS_SB(sb));
+ if (!buf)
return -ENOMEM;
- fname->cf_name.len = utf8_casefold(sb->s_encoding,
- fname->usr_fname,
- fname->cf_name.name,
- F2FS_NAME_LEN);
- if ((int)fname->cf_name.len <= 0) {
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
+
+ len = utf8_casefold(sb->s_encoding, fname->usr_fname,
+ buf, F2FS_NAME_LEN);
+ if (len <= 0) {
+ kmem_cache_free(f2fs_cf_name_slab, buf);
if (sb_has_strict_encoding(sb))
return -EINVAL;
/* fall back to treating name as opaque byte sequence */
+ return 0;
}
+ fname->cf_name.name = buf;
+ fname->cf_name.len = len;
}
-#endif
+
return 0;
}
+void f2fs_free_casefolded_name(struct f2fs_filename *fname)
+{
+ unsigned char *buf = (unsigned char *)fname->cf_name.name;
+
+ if (buf) {
+ kmem_cache_free(f2fs_cf_name_slab, buf);
+ fname->cf_name.name = NULL;
+ }
+}
+#endif /* CONFIG_UNICODE */
+
static int __f2fs_setup_filename(const struct inode *dir,
const struct fscrypt_name *crypt_name,
struct f2fs_filename *fname)
@@ -142,12 +156,7 @@ void f2fs_free_filename(struct f2fs_filename *fname)
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- if (fname->cf_name.name) {
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
- }
-#endif
+ f2fs_free_casefolded_name(fname);
}
static unsigned long dir_block_index(unsigned int level,
@@ -176,58 +185,6 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
return f2fs_find_target_dentry(&d, fname, max_slots);
}
-#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Test whether a case-insensitive directory entry matches the filename
- * being searched for.
- *
- * Returns 1 for a match, 0 for no match, and -errno on an error.
- */
-static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name,
- const u8 *de_name, u32 de_name_len)
-{
- const struct super_block *sb = dir->i_sb;
- const struct unicode_map *um = sb->s_encoding;
- struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
- struct qstr entry = QSTR_INIT(de_name, de_name_len);
- int res;
-
- if (IS_ENCRYPTED(dir)) {
- const struct fscrypt_str encrypted_name =
- FSTR_INIT((u8 *)de_name, de_name_len);
-
- if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir)))
- return -EINVAL;
-
- decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
- if (!decrypted_name.name)
- return -ENOMEM;
- res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name,
- &decrypted_name);
- if (res < 0)
- goto out;
- entry.name = decrypted_name.name;
- entry.len = decrypted_name.len;
- }
-
- res = utf8_strncasecmp_folded(um, name, &entry);
- /*
- * In strict mode, ignore invalid names. In non-strict mode,
- * fall back to treating them as opaque byte sequences.
- */
- if (res < 0 && !sb_has_strict_encoding(sb)) {
- res = name->len == entry.len &&
- memcmp(name->name, entry.name, name->len) == 0;
- } else {
- /* utf8_strncasecmp_folded returns 0 on match */
- res = (res == 0);
- }
-out:
- kfree(decrypted_name.name);
- return res;
-}
-#endif /* CONFIG_UNICODE */
-
static inline int f2fs_match_name(const struct inode *dir,
const struct f2fs_filename *fname,
const u8 *de_name, u32 de_name_len)
@@ -235,11 +192,11 @@ static inline int f2fs_match_name(const struct inode *dir,
struct fscrypt_name f;
#if IS_ENABLED(CONFIG_UNICODE)
- if (fname->cf_name.name) {
- struct qstr cf = FSTR_TO_QSTR(&fname->cf_name);
+ if (fname->cf_name.name)
+ return generic_ci_match(dir, fname->usr_fname,
+ &fname->cf_name,
+ de_name, de_name_len);
- return f2fs_match_ci_name(dir, &cf, de_name, de_name_len);
- }
#endif
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1974b6aff397..8a9d910aa552 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -531,7 +531,7 @@ struct f2fs_filename {
* internal operation where usr_fname is also NULL. In all these cases
* we fall back to treating the name as an opaque byte sequence.
*/
- struct fscrypt_str cf_name;
+ struct qstr cf_name;
#endif
};
@@ -3533,8 +3533,22 @@ int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
/*
* dir.c
*/
+#if IS_ENABLED(CONFIG_UNICODE)
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname);
+void f2fs_free_casefolded_name(struct f2fs_filename *fname);
+#else
+static inline int f2fs_init_casefolded_name(const struct inode *dir,
+ struct f2fs_filename *fname)
+{
+ return 0;
+}
+
+static inline void f2fs_free_casefolded_name(struct f2fs_filename *fname)
+{
+}
+#endif /* CONFIG_UNICODE */
+
int f2fs_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct f2fs_filename *fname);
int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5c0b281a70f3..c1ad9b278c47 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -185,7 +185,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- *pino = parent_ino(dentry);
+ *pino = d_parent_ino(dentry);
dput(dentry);
return 1;
}
@@ -923,10 +923,8 @@ static void __setattr_copy(struct mnt_idmap *idmap,
inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+ if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e54f8c08bda8..1ecde2b45e99 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -576,8 +576,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out_iput;
}
out_splice:
-#if IS_ENABLED(CONFIG_UNICODE)
- if (!inode && IS_CASEFOLDED(dir)) {
+ if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
@@ -586,7 +585,7 @@ out_splice:
trace_f2fs_lookup_end(dir, dentry, ino, err);
return NULL;
}
-#endif
+
new = d_splice_alias(inode, dentry);
trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry,
ino, IS_ERR(new) ? PTR_ERR(new) : err);
@@ -639,16 +638,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
f2fs_delete_entry(de, page, dir, inode);
f2fs_unlock_op(sbi);
-#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at f2fs_lookup(), when it is better
* supported by the VFS for the CI case.
*/
- if (IS_CASEFOLDED(dir))
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
d_invalidate(dentry);
-#endif
+
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 496aee53c38a..8712e264071f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -46,10 +46,6 @@
static struct kmem_cache *fsync_entry_slab;
-#if IS_ENABLED(CONFIG_UNICODE)
-extern struct kmem_cache *f2fs_cf_name_slab;
-#endif
-
bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
{
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
@@ -153,11 +149,8 @@ static int init_recovered_filename(const struct inode *dir,
if (err)
return err;
f2fs_hash_filename(dir, fname);
-#if IS_ENABLED(CONFIG_UNICODE)
/* Case-sensitive match is fine for recovery */
- kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
- fname->cf_name.name = NULL;
-#endif
+ f2fs_free_casefolded_name(fname);
} else {
f2fs_hash_filename(dir, fname);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1f1b3647a998..df4cf31f93df 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -321,7 +321,7 @@ struct kmem_cache *f2fs_cf_name_slab;
static int __init f2fs_create_casefold_cache(void)
{
f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
- F2FS_NAME_LEN);
+ F2FS_NAME_LEN);
return f2fs_cf_name_slab ? 0 : -ENOMEM;
}
@@ -1326,13 +1326,13 @@ default_check:
return -EINVAL;
}
#endif
-#if !IS_ENABLED(CONFIG_UNICODE)
- if (f2fs_sb_has_casefold(sbi)) {
+
+ if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
"Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
return -EINVAL;
}
-#endif
+
/*
* The BLKZONED feature indicates that the drive was formatted with
* zone alignment optimization. This is optional for host-aware
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 66cf4778cf3b..d3e426de5f01 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -7,6 +7,8 @@
#include <linux/hash.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/*
* vfat shortname flags
@@ -51,7 +53,8 @@ struct fat_mount_options {
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
discard:1, /* Issue discard requests on deletions */
- dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */
+ dos1xfloppy:1, /* Assume default BPB for DOS 1.x floppies */
+ debug:1; /* Not currently used */
};
#define FAT_HASH_BITS 8
@@ -415,12 +418,21 @@ extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
extern struct inode *fat_build_inode(struct super_block *sb,
struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
-extern int fat_fill_super(struct super_block *sb, void *data, int silent,
- int isvfat, void (*setup)(struct super_block *));
+extern int fat_fill_super(struct super_block *sb, struct fs_context *fc,
+ void (*setup)(struct super_block *));
extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
+
+extern const struct fs_parameter_spec fat_param_spec[];
+int fat_init_fs_context(struct fs_context *fc, bool is_vfat);
+void fat_free_fc(struct fs_context *fc);
+
+int fat_parse_param(struct fs_context *fc, struct fs_parameter *param,
+ bool is_vfat);
+int fat_reconfigure(struct fs_context *fc);
+
static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
diff --git a/fs/fat/fat_test.c b/fs/fat/fat_test.c
index 2dab4ca1d0d8..1f0062659067 100644
--- a/fs/fat/fat_test.c
+++ b/fs/fat/fat_test.c
@@ -193,4 +193,5 @@ static struct kunit_suite fat_test_suite = {
kunit_test_suites(&fat_test_suite);
+MODULE_DESCRIPTION("KUnit tests for FAT filesystems");
MODULE_LICENSE("GPL v2");
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d9e6fbb6f246..19115fd2d2a4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/seq_file.h>
-#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -804,16 +803,17 @@ static void __exit fat_destroy_inodecache(void)
kmem_cache_destroy(fat_inode_cachep);
}
-static int fat_remount(struct super_block *sb, int *flags, char *data)
+int fat_reconfigure(struct fs_context *fc)
{
bool new_rdonly;
+ struct super_block *sb = fc->root->d_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
+ fc->sb_flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);
sync_filesystem(sb);
/* make sure we update state on remount. */
- new_rdonly = *flags & SB_RDONLY;
+ new_rdonly = fc->sb_flags & SB_RDONLY;
if (new_rdonly != sb_rdonly(sb)) {
if (new_rdonly)
fat_set_state(sb, 0, 0);
@@ -822,6 +822,7 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
}
return 0;
}
+EXPORT_SYMBOL_GPL(fat_reconfigure);
static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -939,8 +940,6 @@ static const struct super_operations fat_sops = {
.evict_inode = fat_evict_inode,
.put_super = fat_put_super,
.statfs = fat_statfs,
- .remount_fs = fat_remount,
-
.show_options = fat_show_options,
};
@@ -1037,355 +1036,282 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
}
enum {
- Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid,
- Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage,
- Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug,
- Opt_immutable, Opt_dots, Opt_nodots,
- Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
- Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
- Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
- Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
- Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
+ Opt_check, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask,
+ Opt_allow_utime, Opt_codepage, Opt_usefree, Opt_nocase, Opt_quiet,
+ Opt_showexec, Opt_debug, Opt_immutable, Opt_dots, Opt_dotsOK,
+ Opt_charset, Opt_shortname, Opt_utf8, Opt_utf8_bool,
+ Opt_uni_xl, Opt_uni_xl_bool, Opt_nonumtail, Opt_nonumtail_bool,
+ Opt_obsolete, Opt_flush, Opt_tz, Opt_rodir, Opt_errors, Opt_discard,
+ Opt_nfs, Opt_nfs_enum, Opt_time_offset, Opt_dos1xfloppy,
};
-static const match_table_t fat_tokens = {
- {Opt_check_r, "check=relaxed"},
- {Opt_check_s, "check=strict"},
- {Opt_check_n, "check=normal"},
- {Opt_check_r, "check=r"},
- {Opt_check_s, "check=s"},
- {Opt_check_n, "check=n"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_dmask, "dmask=%o"},
- {Opt_fmask, "fmask=%o"},
- {Opt_allow_utime, "allow_utime=%o"},
- {Opt_codepage, "codepage=%u"},
- {Opt_usefree, "usefree"},
- {Opt_nocase, "nocase"},
- {Opt_quiet, "quiet"},
- {Opt_showexec, "showexec"},
- {Opt_debug, "debug"},
- {Opt_immutable, "sys_immutable"},
- {Opt_flush, "flush"},
- {Opt_tz_utc, "tz=UTC"},
- {Opt_time_offset, "time_offset=%d"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_discard, "discard"},
- {Opt_nfs_stale_rw, "nfs"},
- {Opt_nfs_stale_rw, "nfs=stale_rw"},
- {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
- {Opt_dos1xfloppy, "dos1xfloppy"},
- {Opt_obsolete, "conv=binary"},
- {Opt_obsolete, "conv=text"},
- {Opt_obsolete, "conv=auto"},
- {Opt_obsolete, "conv=b"},
- {Opt_obsolete, "conv=t"},
- {Opt_obsolete, "conv=a"},
- {Opt_obsolete, "fat=%u"},
- {Opt_obsolete, "blocksize=%u"},
- {Opt_obsolete, "cvf_format=%20s"},
- {Opt_obsolete, "cvf_options=%100s"},
- {Opt_obsolete, "posix"},
- {Opt_err, NULL},
-};
-static const match_table_t msdos_tokens = {
- {Opt_nodots, "nodots"},
- {Opt_nodots, "dotsOK=no"},
- {Opt_dots, "dots"},
- {Opt_dots, "dotsOK=yes"},
- {Opt_err, NULL}
+static const struct constant_table fat_param_check[] = {
+ {"relaxed", 'r'},
+ {"r", 'r'},
+ {"strict", 's'},
+ {"s", 's'},
+ {"normal", 'n'},
+ {"n", 'n'},
+ {}
};
-static const match_table_t vfat_tokens = {
- {Opt_charset, "iocharset=%s"},
- {Opt_shortname_lower, "shortname=lower"},
- {Opt_shortname_win95, "shortname=win95"},
- {Opt_shortname_winnt, "shortname=winnt"},
- {Opt_shortname_mixed, "shortname=mixed"},
- {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */
- {Opt_utf8_no, "utf8=no"},
- {Opt_utf8_no, "utf8=false"},
- {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */
- {Opt_utf8_yes, "utf8=yes"},
- {Opt_utf8_yes, "utf8=true"},
- {Opt_utf8_yes, "utf8"},
- {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */
- {Opt_uni_xl_no, "uni_xlate=no"},
- {Opt_uni_xl_no, "uni_xlate=false"},
- {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */
- {Opt_uni_xl_yes, "uni_xlate=yes"},
- {Opt_uni_xl_yes, "uni_xlate=true"},
- {Opt_uni_xl_yes, "uni_xlate"},
- {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */
- {Opt_nonumtail_no, "nonumtail=no"},
- {Opt_nonumtail_no, "nonumtail=false"},
- {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */
- {Opt_nonumtail_yes, "nonumtail=yes"},
- {Opt_nonumtail_yes, "nonumtail=true"},
- {Opt_nonumtail_yes, "nonumtail"},
- {Opt_rodir, "rodir"},
- {Opt_err, NULL}
+
+static const struct constant_table fat_param_tz[] = {
+ {"UTC", 0},
+ {}
};
-static int parse_options(struct super_block *sb, char *options, int is_vfat,
- int silent, int *debug, struct fat_mount_options *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char *iocharset;
+static const struct constant_table fat_param_errors[] = {
+ {"continue", FAT_ERRORS_CONT},
+ {"panic", FAT_ERRORS_PANIC},
+ {"remount-ro", FAT_ERRORS_RO},
+ {}
+};
- opts->isvfat = is_vfat;
- opts->fs_uid = current_uid();
- opts->fs_gid = current_gid();
- opts->fs_fmask = opts->fs_dmask = current_umask();
- opts->allow_utime = -1;
- opts->codepage = fat_default_codepage;
- fat_reset_iocharset(opts);
- if (is_vfat) {
- opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
- opts->rodir = 0;
- } else {
- opts->shortname = 0;
- opts->rodir = 1;
- }
- opts->name_check = 'n';
- opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
- opts->unicode_xlate = 0;
- opts->numtail = 1;
- opts->usefree = opts->nocase = 0;
- opts->tz_set = 0;
- opts->nfs = 0;
- opts->errors = FAT_ERRORS_RO;
- *debug = 0;
+static const struct constant_table fat_param_nfs[] = {
+ {"stale_rw", FAT_NFS_STALE_RW},
+ {"nostale_ro", FAT_NFS_NOSTALE_RO},
+ {}
+};
- opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+/*
+ * These are all obsolete but we still reject invalid options.
+ * The corresponding values are therefore meaningless.
+ */
+static const struct constant_table fat_param_conv[] = {
+ {"binary", 0},
+ {"text", 0},
+ {"auto", 0},
+ {"b", 0},
+ {"t", 0},
+ {"a", 0},
+ {}
+};
- if (!options)
- goto out;
+/* Core options. See below for vfat and msdos extras */
+const struct fs_parameter_spec fat_param_spec[] = {
+ fsparam_enum ("check", Opt_check, fat_param_check),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_u32oct ("dmask", Opt_dmask),
+ fsparam_u32oct ("fmask", Opt_fmask),
+ fsparam_u32oct ("allow_utime", Opt_allow_utime),
+ fsparam_u32 ("codepage", Opt_codepage),
+ fsparam_flag ("usefree", Opt_usefree),
+ fsparam_flag ("nocase", Opt_nocase),
+ fsparam_flag ("quiet", Opt_quiet),
+ fsparam_flag ("showexec", Opt_showexec),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("sys_immutable", Opt_immutable),
+ fsparam_flag ("flush", Opt_flush),
+ fsparam_enum ("tz", Opt_tz, fat_param_tz),
+ fsparam_s32 ("time_offset", Opt_time_offset),
+ fsparam_enum ("errors", Opt_errors, fat_param_errors),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_flag ("nfs", Opt_nfs),
+ fsparam_enum ("nfs", Opt_nfs_enum, fat_param_nfs),
+ fsparam_flag ("dos1xfloppy", Opt_dos1xfloppy),
+ __fsparam(fs_param_is_enum, "conv",
+ Opt_obsolete, fs_param_deprecated, fat_param_conv),
+ __fsparam(fs_param_is_u32, "fat",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_u32, "blocksize",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_string, "cvf_format",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(fs_param_is_string, "cvf_options",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ __fsparam(NULL, "posix",
+ Opt_obsolete, fs_param_deprecated, NULL),
+ {}
+};
+EXPORT_SYMBOL_GPL(fat_param_spec);
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+static const struct fs_parameter_spec msdos_param_spec[] = {
+ fsparam_flag_no ("dots", Opt_dots),
+ fsparam_bool ("dotsOK", Opt_dotsOK),
+ {}
+};
- token = match_token(p, fat_tokens, args);
- if (token == Opt_err) {
- if (is_vfat)
- token = match_token(p, vfat_tokens, args);
- else
- token = match_token(p, msdos_tokens, args);
- }
- switch (token) {
- case Opt_check_s:
- opts->name_check = 's';
- break;
- case Opt_check_r:
- opts->name_check = 'r';
- break;
- case Opt_check_n:
- opts->name_check = 'n';
- break;
- case Opt_usefree:
- opts->usefree = 1;
- break;
- case Opt_nocase:
- if (!is_vfat)
- opts->nocase = 1;
- else {
- /* for backward compatibility */
- opts->shortname = VFAT_SFN_DISPLAY_WIN95
- | VFAT_SFN_CREATE_WIN95;
- }
- break;
- case Opt_quiet:
- opts->quiet = 1;
- break;
- case Opt_showexec:
- opts->showexec = 1;
- break;
- case Opt_debug:
- *debug = 1;
- break;
- case Opt_immutable:
- opts->sys_immutable = 1;
- break;
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(opts->fs_uid))
- return -EINVAL;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->fs_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(opts->fs_gid))
- return -EINVAL;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask = opts->fs_dmask = option;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_dmask = option;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->fs_fmask = option;
- break;
- case Opt_allow_utime:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->allow_utime = option & (S_IWGRP | S_IWOTH);
- break;
- case Opt_codepage:
- if (match_int(&args[0], &option))
- return -EINVAL;
- opts->codepage = option;
- break;
- case Opt_flush:
- opts->flush = 1;
- break;
- case Opt_time_offset:
- if (match_int(&args[0], &option))
- return -EINVAL;
- /*
- * GMT+-12 zones may have DST corrections so at least
- * 13 hours difference is needed. Make the limit 24
- * just in case someone invents something unusual.
- */
- if (option < -24 * 60 || option > 24 * 60)
- return -EINVAL;
- opts->tz_set = 1;
- opts->time_offset = option;
- break;
- case Opt_tz_utc:
- opts->tz_set = 1;
- opts->time_offset = 0;
- break;
- case Opt_err_cont:
- opts->errors = FAT_ERRORS_CONT;
- break;
- case Opt_err_panic:
- opts->errors = FAT_ERRORS_PANIC;
- break;
- case Opt_err_ro:
- opts->errors = FAT_ERRORS_RO;
- break;
- case Opt_nfs_stale_rw:
- opts->nfs = FAT_NFS_STALE_RW;
- break;
- case Opt_nfs_nostale_ro:
- opts->nfs = FAT_NFS_NOSTALE_RO;
- break;
- case Opt_dos1xfloppy:
- opts->dos1xfloppy = 1;
- break;
+static const struct constant_table fat_param_shortname[] = {
+ {"lower", VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95},
+ {"win95", VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95},
+ {"winnt", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT},
+ {"mixed", VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95},
+ {}
+};
- /* msdos specific */
- case Opt_dots:
- opts->dotsOK = 1;
- break;
- case Opt_nodots:
- opts->dotsOK = 0;
- break;
+static const struct fs_parameter_spec vfat_param_spec[] = {
+ fsparam_string ("iocharset", Opt_charset),
+ fsparam_enum ("shortname", Opt_shortname, fat_param_shortname),
+ fsparam_flag ("utf8", Opt_utf8),
+ fsparam_bool ("utf8", Opt_utf8_bool),
+ fsparam_flag ("uni_xlate", Opt_uni_xl),
+ fsparam_bool ("uni_xlate", Opt_uni_xl_bool),
+ fsparam_flag ("nonumtail", Opt_nonumtail),
+ fsparam_bool ("nonumtail", Opt_nonumtail_bool),
+ fsparam_flag ("rodir", Opt_rodir),
+ {}
+};
- /* vfat specific */
- case Opt_charset:
- fat_reset_iocharset(opts);
- iocharset = match_strdup(&args[0]);
- if (!iocharset)
- return -ENOMEM;
- opts->iocharset = iocharset;
- break;
- case Opt_shortname_lower:
- opts->shortname = VFAT_SFN_DISPLAY_LOWER
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_shortname_win95:
- opts->shortname = VFAT_SFN_DISPLAY_WIN95
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_shortname_winnt:
- opts->shortname = VFAT_SFN_DISPLAY_WINNT
- | VFAT_SFN_CREATE_WINNT;
- break;
- case Opt_shortname_mixed:
- opts->shortname = VFAT_SFN_DISPLAY_WINNT
- | VFAT_SFN_CREATE_WIN95;
- break;
- case Opt_utf8_no: /* 0 or no or false */
- opts->utf8 = 0;
- break;
- case Opt_utf8_yes: /* empty or 1 or yes or true */
- opts->utf8 = 1;
- break;
- case Opt_uni_xl_no: /* 0 or no or false */
- opts->unicode_xlate = 0;
- break;
- case Opt_uni_xl_yes: /* empty or 1 or yes or true */
- opts->unicode_xlate = 1;
- break;
- case Opt_nonumtail_no: /* 0 or no or false */
- opts->numtail = 1; /* negated option */
- break;
- case Opt_nonumtail_yes: /* empty or 1 or yes or true */
- opts->numtail = 0; /* negated option */
- break;
- case Opt_rodir:
- opts->rodir = 1;
- break;
- case Opt_discard:
- opts->discard = 1;
- break;
+int fat_parse_param(struct fs_context *fc, struct fs_parameter *param,
+ bool is_vfat)
+{
+ struct fat_mount_options *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- /* obsolete mount options */
- case Opt_obsolete:
- fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
- "not supported now", p);
- break;
- /* unknown option */
- default:
- if (!silent) {
- fat_msg(sb, KERN_ERR,
- "Unrecognized mount option \"%s\" "
- "or missing value", p);
- }
- return -EINVAL;
- }
- }
+ /* remount options have traditionally been ignored */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
-out:
- /* UTF-8 doesn't provide FAT semantics */
- if (!strcmp(opts->iocharset, "utf8")) {
- fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
- " for FAT filesystems, filesystem will be "
- "case sensitive!");
+ opt = fs_parse(fc, fat_param_spec, param, &result);
+ /* If option not found in fat_param_spec, try vfat/msdos options */
+ if (opt == -ENOPARAM) {
+ if (is_vfat)
+ opt = fs_parse(fc, vfat_param_spec, param, &result);
+ else
+ opt = fs_parse(fc, msdos_param_spec, param, &result);
}
- /* If user doesn't specify allow_utime, it's initialized from dmask. */
- if (opts->allow_utime == (unsigned short)-1)
- opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
- if (opts->unicode_xlate)
- opts->utf8 = 0;
- if (opts->nfs == FAT_NFS_NOSTALE_RO) {
- sb->s_flags |= SB_RDONLY;
- sb->s_export_op = &fat_export_ops_nostale;
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_check:
+ opts->name_check = result.uint_32;
+ break;
+ case Opt_usefree:
+ opts->usefree = 1;
+ break;
+ case Opt_nocase:
+ if (!is_vfat)
+ opts->nocase = 1;
+ else {
+ /* for backward compatibility */
+ opts->shortname = VFAT_SFN_DISPLAY_WIN95
+ | VFAT_SFN_CREATE_WIN95;
+ }
+ break;
+ case Opt_quiet:
+ opts->quiet = 1;
+ break;
+ case Opt_showexec:
+ opts->showexec = 1;
+ break;
+ case Opt_debug:
+ opts->debug = 1;
+ break;
+ case Opt_immutable:
+ opts->sys_immutable = 1;
+ break;
+ case Opt_uid:
+ opts->fs_uid = result.uid;
+ break;
+ case Opt_gid:
+ opts->fs_gid = result.gid;
+ break;
+ case Opt_umask:
+ opts->fs_fmask = opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_dmask:
+ opts->fs_dmask = result.uint_32;
+ break;
+ case Opt_fmask:
+ opts->fs_fmask = result.uint_32;
+ break;
+ case Opt_allow_utime:
+ opts->allow_utime = result.uint_32 & (S_IWGRP | S_IWOTH);
+ break;
+ case Opt_codepage:
+ opts->codepage = result.uint_32;
+ break;
+ case Opt_flush:
+ opts->flush = 1;
+ break;
+ case Opt_time_offset:
+ /*
+ * GMT+-12 zones may have DST corrections so at least
+ * 13 hours difference is needed. Make the limit 24
+ * just in case someone invents something unusual.
+ */
+ if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60)
+ return -EINVAL;
+ opts->tz_set = 1;
+ opts->time_offset = result.int_32;
+ break;
+ case Opt_tz:
+ opts->tz_set = 1;
+ opts->time_offset = result.uint_32;
+ break;
+ case Opt_errors:
+ opts->errors = result.uint_32;
+ break;
+ case Opt_nfs:
+ opts->nfs = FAT_NFS_STALE_RW;
+ break;
+ case Opt_nfs_enum:
+ opts->nfs = result.uint_32;
+ break;
+ case Opt_dos1xfloppy:
+ opts->dos1xfloppy = 1;
+ break;
+
+ /* msdos specific */
+ case Opt_dots: /* dots / nodots */
+ opts->dotsOK = !result.negated;
+ break;
+ case Opt_dotsOK: /* dotsOK = yes/no */
+ opts->dotsOK = result.boolean;
+ break;
+
+ /* vfat specific */
+ case Opt_charset:
+ fat_reset_iocharset(opts);
+ opts->iocharset = param->string;
+ param->string = NULL; /* Steal string */
+ break;
+ case Opt_shortname:
+ opts->shortname = result.uint_32;
+ break;
+ case Opt_utf8:
+ opts->utf8 = 1;
+ break;
+ case Opt_utf8_bool:
+ opts->utf8 = result.boolean;
+ break;
+ case Opt_uni_xl:
+ opts->unicode_xlate = 1;
+ break;
+ case Opt_uni_xl_bool:
+ opts->unicode_xlate = result.boolean;
+ break;
+ case Opt_nonumtail:
+ opts->numtail = 0; /* negated option */
+ break;
+ case Opt_nonumtail_bool:
+ opts->numtail = !result.boolean; /* negated option */
+ break;
+ case Opt_rodir:
+ opts->rodir = 1;
+ break;
+ case Opt_discard:
+ opts->discard = 1;
+ break;
+
+ /* obsolete mount options */
+ case Opt_obsolete:
+ printk(KERN_INFO "FAT-fs: \"%s\" option is obsolete, "
+ "not supported now", param->key);
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
+EXPORT_SYMBOL_GPL(fat_parse_param);
static int fat_read_root(struct inode *inode)
{
@@ -1604,9 +1530,11 @@ out:
/*
* Read the super block of an MS-DOS FS.
*/
-int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
+int fat_fill_super(struct super_block *sb, struct fs_context *fc,
void (*setup)(struct super_block *))
{
+ struct fat_mount_options *opts = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct inode *root_inode = NULL, *fat_inode = NULL;
struct inode *fsinfo_inode = NULL;
struct buffer_head *bh;
@@ -1614,7 +1542,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
struct msdos_sb_info *sbi;
u16 logical_sector_size;
u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
- int debug;
long error;
char buf[50];
struct timespec64 ts;
@@ -1643,9 +1570,27 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
- if (error)
- goto out_fail;
+ /* UTF-8 doesn't provide FAT semantics */
+ if (!strcmp(opts->iocharset, "utf8")) {
+ fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
+ " for FAT filesystems, filesystem will be"
+ " case sensitive!");
+ }
+
+ /* If user doesn't specify allow_utime, it's initialized from dmask. */
+ if (opts->allow_utime == (unsigned short)-1)
+ opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
+ if (opts->unicode_xlate)
+ opts->utf8 = 0;
+ if (opts->nfs == FAT_NFS_NOSTALE_RO) {
+ sb->s_flags |= SB_RDONLY;
+ sb->s_export_op = &fat_export_ops_nostale;
+ }
+
+ /* Apply parsed options to sbi (structure copy) */
+ sbi->options = *opts;
+ /* Transfer ownership of iocharset to sbi->options */
+ opts->iocharset = NULL;
setup(sb); /* flavour-specific stuff that needs options */
@@ -1950,6 +1895,57 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);
+int fat_init_fs_context(struct fs_context *fc, bool is_vfat)
+{
+ struct fat_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ opts->isvfat = is_vfat;
+ opts->fs_uid = current_uid();
+ opts->fs_gid = current_gid();
+ opts->fs_fmask = opts->fs_dmask = current_umask();
+ opts->allow_utime = -1;
+ opts->codepage = fat_default_codepage;
+ fat_reset_iocharset(opts);
+ if (is_vfat) {
+ opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
+ opts->rodir = 0;
+ } else {
+ opts->shortname = 0;
+ opts->rodir = 1;
+ }
+ opts->name_check = 'n';
+ opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
+ opts->unicode_xlate = 0;
+ opts->numtail = 1;
+ opts->usefree = opts->nocase = 0;
+ opts->tz_set = 0;
+ opts->nfs = 0;
+ opts->errors = FAT_ERRORS_RO;
+ opts->debug = 0;
+
+ opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+
+ fc->fs_private = opts;
+ /* fc->ops assigned by caller */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fat_init_fs_context);
+
+void fat_free_fc(struct fs_context *fc)
+{
+ struct fat_mount_options *opts = fc->fs_private;
+
+ if (opts->iocharset != fat_default_iocharset)
+ kfree(opts->iocharset);
+ kfree(fc->fs_private);
+}
+EXPORT_SYMBOL_GPL(fat_free_fc);
+
static int __init init_fat_fs(void)
{
int err;
@@ -1978,4 +1974,5 @@ static void __exit exit_fat_fs(void)
module_init(init_fat_fs)
module_exit(exit_fat_fs)
+MODULE_DESCRIPTION("Core FAT filesystem support");
MODULE_LICENSE("GPL");
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 2116c486843b..f06f6ba643cc 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -650,24 +650,48 @@ static void setup(struct super_block *sb)
sb->s_flags |= SB_NOATIME;
}
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+static int msdos_fill_super(struct super_block *sb, struct fs_context *fc)
{
- return fat_fill_super(sb, data, silent, 0, setup);
+ return fat_fill_super(sb, fc, setup);
}
-static struct dentry *msdos_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static int msdos_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+ return get_tree_bdev(fc, msdos_fill_super);
+}
+
+static int msdos_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ return fat_parse_param(fc, param, false);
+}
+
+static const struct fs_context_operations msdos_context_ops = {
+ .parse_param = msdos_parse_param,
+ .get_tree = msdos_get_tree,
+ .reconfigure = fat_reconfigure,
+ .free = fat_free_fc,
+};
+
+static int msdos_init_fs_context(struct fs_context *fc)
+{
+ int err;
+
+ /* Initialize with is_vfat == false */
+ err = fat_init_fs_context(fc, false);
+ if (err)
+ return err;
+
+ fc->ops = &msdos_context_ops;
+ return 0;
}
static struct file_system_type msdos_fs_type = {
.owner = THIS_MODULE,
.name = "msdos",
- .mount = msdos_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .init_fs_context = msdos_init_fs_context,
+ .parameters = fat_param_spec,
};
MODULE_ALIAS_FS("msdos");
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c4d00999a433..6423e1dedf14 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1195,24 +1195,48 @@ static void setup(struct super_block *sb)
sb->s_d_op = &vfat_dentry_ops;
}
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+static int vfat_fill_super(struct super_block *sb, struct fs_context *fc)
{
- return fat_fill_super(sb, data, silent, 1, setup);
+ return fat_fill_super(sb, fc, setup);
}
-static struct dentry *vfat_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
+static int vfat_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+ return get_tree_bdev(fc, vfat_fill_super);
+}
+
+static int vfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ return fat_parse_param(fc, param, true);
+}
+
+static const struct fs_context_operations vfat_context_ops = {
+ .parse_param = vfat_parse_param,
+ .get_tree = vfat_get_tree,
+ .reconfigure = fat_reconfigure,
+ .free = fat_free_fc,
+};
+
+static int vfat_init_fs_context(struct fs_context *fc)
+{
+ int err;
+
+ /* Initialize with is_vfat == true */
+ err = fat_init_fs_context(fc, true);
+ if (err)
+ return err;
+
+ fc->ops = &vfat_context_ops;
+ return 0;
}
static struct file_system_type vfat_fs_type = {
.owner = THIS_MODULE,
.name = "vfat",
- .mount = vfat_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .init_fs_context = vfat_init_fs_context,
+ .parameters = fat_param_spec,
};
MODULE_ALIAS_FS("vfat");
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 8a7f86c2139a..6e8cea16790e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -115,88 +115,188 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
-static struct vfsmount *get_vfsmount_from_fd(int fd)
+static int get_path_from_fd(int fd, struct path *root)
{
- struct vfsmount *mnt;
-
if (fd == AT_FDCWD) {
struct fs_struct *fs = current->fs;
spin_lock(&fs->lock);
- mnt = mntget(fs->pwd.mnt);
+ *root = fs->pwd;
+ path_get(root);
spin_unlock(&fs->lock);
} else {
struct fd f = fdget(fd);
if (!f.file)
- return ERR_PTR(-EBADF);
- mnt = mntget(f.file->f_path.mnt);
+ return -EBADF;
+ *root = f.file->f_path;
+ path_get(root);
fdput(f);
}
- return mnt;
+
+ return 0;
}
+enum handle_to_path_flags {
+ HANDLE_CHECK_PERMS = (1 << 0),
+ HANDLE_CHECK_SUBTREE = (1 << 1),
+};
+
+struct handle_to_path_ctx {
+ struct path root;
+ enum handle_to_path_flags flags;
+ unsigned int fh_flags;
+};
+
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
- return 1;
+ struct handle_to_path_ctx *ctx = context;
+ struct user_namespace *user_ns = current_user_ns();
+ struct dentry *d, *root = ctx->root.dentry;
+ struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt);
+ int retval = 0;
+
+ if (!root)
+ return 1;
+
+ /* Old permission model with global CAP_DAC_READ_SEARCH. */
+ if (!ctx->flags)
+ return 1;
+
+ /*
+ * It's racy as we're not taking rename_lock but we're able to ignore
+ * permissions and we just need an approximation whether we were able
+ * to follow a path to the file.
+ *
+ * It's also potentially expensive on some filesystems especially if
+ * there is a deep path.
+ */
+ d = dget(dentry);
+ while (d != root && !IS_ROOT(d)) {
+ struct dentry *parent = dget_parent(d);
+
+ /*
+ * We know that we have the ability to override DAC permissions
+ * as we've verified this earlier via CAP_DAC_READ_SEARCH. But
+ * we also need to make sure that there aren't any unmapped
+ * inodes in the path that would prevent us from reaching the
+ * file.
+ */
+ if (!privileged_wrt_inode_uidgid(user_ns, idmap,
+ d_inode(parent))) {
+ dput(d);
+ dput(parent);
+ return retval;
+ }
+
+ dput(d);
+ d = parent;
+ }
+
+ if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root)
+ retval = 1;
+ WARN_ON_ONCE(d != root && d != root->d_sb->s_root);
+ dput(d);
+ return retval;
}
-static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
- struct path *path)
+static int do_handle_to_path(struct file_handle *handle, struct path *path,
+ struct handle_to_path_ctx *ctx)
{
- int retval = 0;
int handle_dwords;
+ struct vfsmount *mnt = ctx->root.mnt;
- path->mnt = get_vfsmount_from_fd(mountdirfd);
- if (IS_ERR(path->mnt)) {
- retval = PTR_ERR(path->mnt);
- goto out_err;
- }
/* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2;
- path->dentry = exportfs_decode_fh(path->mnt,
+ path->dentry = exportfs_decode_fh_raw(mnt,
(struct fid *)handle->f_handle,
handle_dwords, handle->handle_type,
- vfs_dentry_acceptable, NULL);
- if (IS_ERR(path->dentry)) {
- retval = PTR_ERR(path->dentry);
- goto out_mnt;
+ ctx->fh_flags,
+ vfs_dentry_acceptable, ctx);
+ if (IS_ERR_OR_NULL(path->dentry)) {
+ if (path->dentry == ERR_PTR(-ENOMEM))
+ return -ENOMEM;
+ return -ESTALE;
}
+ path->mnt = mntget(mnt);
return 0;
-out_mnt:
- mntput(path->mnt);
-out_err:
- return retval;
+}
+
+/*
+ * Allow relaxed permissions of file handles if the caller has the
+ * ability to mount the filesystem or create a bind-mount of the
+ * provided @mountdirfd.
+ *
+ * In both cases the caller may be able to get an unobstructed way to
+ * the encoded file handle. If the caller is only able to create a
+ * bind-mount we need to verify that there are no locked mounts on top
+ * of it that could prevent us from getting to the encoded file.
+ *
+ * In principle, locked mounts can prevent the caller from mounting the
+ * filesystem but that only applies to procfs and sysfs neither of which
+ * support decoding file handles.
+ */
+static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
+ unsigned int o_flags)
+{
+ struct path *root = &ctx->root;
+
+ /*
+ * Restrict to O_DIRECTORY to provide a deterministic API that avoids a
+ * confusing api in the face of disconnected non-dir dentries.
+ *
+ * There's only one dentry for each directory inode (VFS rule)...
+ */
+ if (!(o_flags & O_DIRECTORY))
+ return false;
+
+ if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+ ctx->flags = HANDLE_CHECK_PERMS;
+ else if (is_mounted(root->mnt) &&
+ ns_capable(real_mount(root->mnt)->mnt_ns->user_ns,
+ CAP_SYS_ADMIN) &&
+ !has_locked_children(real_mount(root->mnt), root->dentry))
+ ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
+ else
+ return false;
+
+ /* Are we able to override DAC permissions? */
+ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
+ return false;
+
+ ctx->fh_flags = EXPORT_FH_DIR_ONLY;
+ return true;
}
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
- struct path *path)
+ struct path *path, unsigned int o_flags)
{
int retval = 0;
struct file_handle f_handle;
struct file_handle *handle = NULL;
+ struct handle_to_path_ctx ctx = {};
- /*
- * With handle we don't look at the execute bit on the
- * directory. Ideally we would like CAP_DAC_SEARCH.
- * But we don't have that
- */
- if (!capable(CAP_DAC_READ_SEARCH)) {
- retval = -EPERM;
+ retval = get_path_from_fd(mountdirfd, &ctx.root);
+ if (retval)
goto out_err;
+
+ if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) {
+ retval = -EPERM;
+ goto out_path;
}
+
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT;
- goto out_err;
+ goto out_path;
}
if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
(f_handle.handle_bytes == 0)) {
retval = -EINVAL;
- goto out_err;
+ goto out_path;
}
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
retval = -ENOMEM;
- goto out_err;
+ goto out_path;
}
/* copy the full handle */
*handle = f_handle;
@@ -207,10 +307,12 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
goto out_handle;
}
- retval = do_handle_to_path(mountdirfd, handle, path);
+ retval = do_handle_to_path(handle, path, &ctx);
out_handle:
kfree(handle);
+out_path:
+ path_put(&ctx.root);
out_err:
return retval;
}
@@ -223,7 +325,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
struct file *file;
int fd;
- retval = handle_to_path(mountdirfd, ufh, &path);
+ retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index a4d6ca0b8971..24727ec34e5a 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -308,6 +308,40 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
}
EXPORT_SYMBOL(fs_param_is_fd);
+int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
+ struct fs_parameter *param, struct fs_parse_result *result)
+{
+ kuid_t uid;
+
+ if (fs_param_is_u32(log, p, param, result) != 0)
+ return fs_param_bad_value(log, param);
+
+ uid = make_kuid(current_user_ns(), result->uint_32);
+ if (!uid_valid(uid))
+ return inval_plog(log, "Invalid uid '%s'", param->string);
+
+ result->uid = uid;
+ return 0;
+}
+EXPORT_SYMBOL(fs_param_is_uid);
+
+int fs_param_is_gid(struct p_log *log, const struct fs_parameter_spec *p,
+ struct fs_parameter *param, struct fs_parse_result *result)
+{
+ kgid_t gid;
+
+ if (fs_param_is_u32(log, p, param, result) != 0)
+ return fs_param_bad_value(log, param);
+
+ gid = make_kgid(current_user_ns(), result->uint_32);
+ if (!gid_valid(gid))
+ return inval_plog(log, "Invalid gid '%s'", param->string);
+
+ result->gid = gid;
+ return 0;
+}
+EXPORT_SYMBOL(fs_param_is_gid);
+
int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 6593ae518115..ed2dd000622e 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -220,10 +220,6 @@ static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
if (!mount_capable(fc))
return -EPERM;
- /* require the new mount api */
- if (exclusive && fc->ops == &legacy_fs_context_ops)
- return -EOPNOTSUPP;
-
fc->phase = FS_CONTEXT_CREATING;
fc->exclusive = exclusive;
@@ -411,6 +407,7 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_PATH:
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
+ case FSCONFIG_CMD_CREATE_EXCL:
ret = -EOPNOTSUPP;
goto out_f;
}
@@ -451,7 +448,7 @@ SYSCALL_DEFINE5(fsconfig,
fallthrough;
case FSCONFIG_SET_PATH:
param.type = fs_value_is_filename;
- param.name = getname_flags(_value, lookup_flags, NULL);
+ param.name = getname_flags(_value, lookup_flags);
if (IS_ERR(param.name)) {
ret = PTR_ERR(param.name);
goto out_key;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 3d192b80a561..04cfd8fee992 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -146,8 +146,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
* be stripped.
*/
if (fc->posix_acl &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) &&
- !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID))
+ !in_group_or_capable(&nop_mnt_idmap, inode,
+ i_gid_into_vfsgid(&nop_mnt_idmap, inode)))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
ret = fuse_setxattr(inode, name, value, size, 0, extra_flags);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 99e44ea7d875..d8ab4e93916f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -740,8 +740,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = {
fsparam_string ("source", OPT_SOURCE),
fsparam_u32 ("fd", OPT_FD),
fsparam_u32oct ("rootmode", OPT_ROOTMODE),
- fsparam_u32 ("user_id", OPT_USER_ID),
- fsparam_u32 ("group_id", OPT_GROUP_ID),
+ fsparam_uid ("user_id", OPT_USER_ID),
+ fsparam_gid ("group_id", OPT_GROUP_ID),
fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
fsparam_u32 ("max_read", OPT_MAX_READ),
@@ -755,6 +755,8 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
struct fs_parse_result result;
struct fuse_fs_context *ctx = fsc->fs_private;
int opt;
+ kuid_t kuid;
+ kgid_t kgid;
if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
/*
@@ -799,16 +801,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param)
break;
case OPT_USER_ID:
- ctx->user_id = make_kuid(fsc->user_ns, result.uint_32);
- if (!uid_valid(ctx->user_id))
+ kuid = result.uid;
+ /*
+ * The requested uid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kuid_has_mapping(fsc->user_ns, kuid))
return invalfc(fsc, "Invalid user_id");
+ ctx->user_id = kuid;
ctx->user_id_present = true;
break;
case OPT_GROUP_ID:
- ctx->group_id = make_kgid(fsc->user_ns, result.uint_32);
- if (!gid_valid(ctx->group_id))
+ kgid = result.gid;
+ /*
+ * The requested gid must be representable in the
+ * filesystem's idmapping.
+ */
+ if (!kgid_has_mapping(fsc->user_ns, kgid))
return invalfc(fsc, "Invalid group_id");
+ ctx->group_id = kgid;
ctx->group_id_present = true;
break;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 1a52a51b6b07..dd5260141615 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -51,7 +51,7 @@ struct virtio_fs_vq {
struct work_struct done_work;
struct list_head queued_reqs;
struct list_head end_reqs; /* End these requests */
- struct delayed_work dispatch_work;
+ struct work_struct dispatch_work;
struct fuse_dev *fud;
bool connected;
long in_flight;
@@ -233,7 +233,7 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq)
}
flush_work(&fsvq->done_work);
- flush_delayed_work(&fsvq->dispatch_work);
+ flush_work(&fsvq->dispatch_work);
}
static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs)
@@ -408,6 +408,10 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
dec_in_flight_req(fsvq);
}
} while (!virtqueue_enable_cb(vq));
+
+ if (!list_empty(&fsvq->queued_reqs))
+ schedule_work(&fsvq->dispatch_work);
+
spin_unlock(&fsvq->lock);
}
@@ -415,7 +419,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
{
struct fuse_req *req;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
- dispatch_work.work);
+ dispatch_work);
int ret;
pr_debug("virtio-fs: worker %s called.\n", __func__);
@@ -447,11 +451,9 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work)
ret = virtio_fs_enqueue_req(fsvq, req, true);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->queued_reqs);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
spin_unlock(&fsvq->lock);
return;
}
@@ -494,12 +496,10 @@ static int send_forget_request(struct virtio_fs_vq *fsvq,
ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n",
ret);
list_add_tail(&forget->list, &fsvq->queued_reqs);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
if (!in_flight)
inc_in_flight_req(fsvq);
/* Queue is full */
@@ -531,7 +531,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work)
{
struct virtio_fs_forget *forget;
struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
- dispatch_work.work);
+ dispatch_work);
pr_debug("virtio-fs: worker %s called.\n", __func__);
while (1) {
spin_lock(&fsvq->lock);
@@ -709,6 +709,12 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
virtio_fs_request_complete(req, fsvq);
}
}
+
+ /* Try to push previously queued requests, as the queue might no longer be full */
+ spin_lock(&fsvq->lock);
+ if (!list_empty(&fsvq->queued_reqs))
+ schedule_work(&fsvq->dispatch_work);
+ spin_unlock(&fsvq->lock);
}
static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
@@ -770,12 +776,12 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
if (vq_type == VQ_REQUEST) {
INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work);
- INIT_DELAYED_WORK(&fsvq->dispatch_work,
- virtio_fs_request_dispatch_work);
+ INIT_WORK(&fsvq->dispatch_work,
+ virtio_fs_request_dispatch_work);
} else {
INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work);
- INIT_DELAYED_WORK(&fsvq->dispatch_work,
- virtio_fs_hiprio_dispatch_work);
+ INIT_WORK(&fsvq->dispatch_work,
+ virtio_fs_hiprio_dispatch_work);
}
}
@@ -783,14 +789,13 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name,
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
{
+ struct virtqueue_info *vqs_info;
struct virtqueue **vqs;
- vq_callback_t **callbacks;
/* Specify pre_vectors to ensure that the queues before the
* request queues (e.g. hiprio) don't claim any of the CPUs in
* the multi-queue mapping and interrupt affinities
*/
struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
- const char **names;
unsigned int i;
int ret = 0;
@@ -808,20 +813,18 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
return -ENOMEM;
vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL);
- callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
- GFP_KERNEL);
- names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
dev_to_node(&vdev->dev));
- if (!vqs || !callbacks || !names || !fs->mq_map) {
+ vqs_info = kcalloc(fs->nvqs, sizeof(*vqs_info), GFP_KERNEL);
+ if (!vqs || !vqs_info || !fs->mq_map) {
ret = -ENOMEM;
goto out;
}
/* Initialize the hiprio/forget request virtqueue */
- callbacks[VQ_HIPRIO] = virtio_fs_vq_done;
+ vqs_info[VQ_HIPRIO].callback = virtio_fs_vq_done;
virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO);
- names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name;
+ vqs_info[VQ_HIPRIO].name = fs->vqs[VQ_HIPRIO].name;
/* Initialize the requests virtqueues */
for (i = VQ_REQUEST; i < fs->nvqs; i++) {
@@ -829,11 +832,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST);
virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST);
- callbacks[i] = virtio_fs_vq_done;
- names[i] = fs->vqs[i].name;
+ vqs_info[i].callback = virtio_fs_vq_done;
+ vqs_info[i].name = fs->vqs[i].name;
}
- ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc);
+ ret = virtio_find_vqs(vdev, fs->nvqs, vqs, vqs_info, &desc);
if (ret < 0)
goto out;
@@ -842,8 +845,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
virtio_fs_start_all_queues(fs);
out:
- kfree(names);
- kfree(callbacks);
+ kfree(vqs_info);
kfree(vqs);
if (ret) {
kfree(fs->vqs);
@@ -1367,7 +1369,7 @@ __releases(fiq->lock)
fsvq = &fs->vqs[queue_id];
ret = virtio_fs_enqueue_req(fsvq, req, false);
if (ret < 0) {
- if (ret == -ENOMEM || ret == -ENOSPC) {
+ if (ret == -ENOSPC) {
/*
* Virtqueue full. Retry submission from worker
* context as we might be holding fc->bg_lock.
@@ -1375,8 +1377,6 @@ __releases(fiq->lock)
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->queued_reqs);
inc_in_flight_req(fsvq);
- schedule_delayed_work(&fsvq->dispatch_work,
- msecs_to_jiffies(1));
spin_unlock(&fsvq->lock);
return;
}
@@ -1386,7 +1386,7 @@ __releases(fiq->lock)
/* Can't end request in submission context. Use a worker */
spin_lock(&fsvq->lock);
list_add_tail(&req->list, &fsvq->end_reqs);
- schedule_delayed_work(&fsvq->dispatch_work, 0);
+ schedule_work(&fsvq->dispatch_work);
spin_unlock(&fsvq->lock);
return;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4ea6c8bfb4e6..12a769077ea0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -61,12 +61,10 @@ struct gfs2_glock_iter {
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
-static void __gfs2_glock_dq(struct gfs2_holder *gh);
-static void handle_callback(struct gfs2_glock *gl, unsigned int state,
- unsigned long delay, bool remote);
+static void request_demote(struct gfs2_glock *gl, unsigned int state,
+ unsigned long delay, bool remote);
static struct dentry *gfs2_root;
-static struct workqueue_struct *glock_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
@@ -218,34 +216,9 @@ struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl)
return gl;
}
-/**
- * demote_ok - Check to see if it's ok to unlock a glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int demote_ok(const struct gfs2_glock *gl)
-{
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (gl->gl_state == LM_ST_UNLOCKED)
- return 0;
- if (!list_empty(&gl->gl_holders))
- return 0;
- if (glops->go_demote_ok)
- return glops->go_demote_ok(gl);
- return 1;
-}
-
-
-void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+static void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
{
- if (!(gl->gl_ops->go_flags & GLOF_LRU))
- return;
-
spin_lock(&lru_lock);
-
list_move_tail(&gl->gl_lru, &lru_list);
if (!test_bit(GLF_LRU, &gl->gl_flags)) {
@@ -258,9 +231,6 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
- if (!(gl->gl_ops->go_flags & GLOF_LRU))
- return;
-
spin_lock(&lru_lock);
if (test_bit(GLF_LRU, &gl->gl_flags)) {
list_del_init(&gl->gl_lru);
@@ -275,7 +245,9 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
* work queue.
*/
static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) {
- if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) {
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ if (!queue_delayed_work(sdp->sd_glock_wq, &gl->gl_work, delay)) {
/*
* We are holding the lockref spinlock, and the work was still
* queued above. The queued work (glock_work_func) takes that
@@ -305,6 +277,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
+static bool __gfs2_glock_put_or_lock(struct gfs2_glock *gl)
+{
+ if (lockref_put_or_lock(&gl->gl_lockref))
+ return true;
+ GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1);
+ if (gl->gl_state != LM_ST_UNLOCKED) {
+ gl->gl_lockref.count--;
+ gfs2_glock_add_to_lru(gl);
+ spin_unlock(&gl->gl_lockref.lock);
+ return true;
+ }
+ return false;
+}
+
/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
@@ -313,7 +299,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- if (lockref_put_or_lock(&gl->gl_lockref))
+ if (__gfs2_glock_put_or_lock(gl))
return;
__gfs2_glock_put(gl);
@@ -328,10 +314,9 @@ void gfs2_glock_put(struct gfs2_glock *gl)
*/
void gfs2_glock_put_async(struct gfs2_glock *gl)
{
- if (lockref_put_or_lock(&gl->gl_lockref))
+ if (__gfs2_glock_put_or_lock(gl))
return;
- GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -570,18 +555,6 @@ static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl)
static void state_change(struct gfs2_glock *gl, unsigned int new_state)
{
- int held1, held2;
-
- held1 = (gl->gl_state != LM_ST_UNLOCKED);
- held2 = (new_state != LM_ST_UNLOCKED);
-
- if (held1 != held2) {
- GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
- if (held2)
- gl->gl_lockref.count++;
- else
- gl->gl_lockref.count--;
- }
if (new_state != gl->gl_target)
/* shorten our minimum hold time */
gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR,
@@ -812,7 +785,7 @@ skip_inval:
(target != LM_ST_UNLOCKED ||
test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
if (!is_system_glock(gl)) {
- handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */
+ request_demote(gl, LM_ST_UNLOCKED, 0, false);
/*
* Ordinarily, we would call dlm and its callback would call
* finish_xmote, which would call state_change() to the new state.
@@ -910,7 +883,6 @@ out_sched:
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
smp_mb__after_atomic();
- return;
}
/**
@@ -1111,19 +1083,21 @@ static void glock_work_func(struct work_struct *work)
unsigned int drop_refs = 1;
spin_lock(&gl->gl_lockref.lock);
- if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
- clear_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) {
+ clear_bit(GLF_HAVE_REPLY, &gl->gl_flags);
finish_xmote(gl, gl->gl_reply);
drop_refs++;
}
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
- unsigned long holdtime, now = jiffies;
+ if (gl->gl_name.ln_type == LM_TYPE_INODE) {
+ unsigned long holdtime, now = jiffies;
- holdtime = gl->gl_tchange + gl->gl_hold_time;
- if (time_before(now, holdtime))
- delay = holdtime - now;
+ holdtime = gl->gl_tchange + gl->gl_hold_time;
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ }
if (!delay) {
clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
@@ -1134,20 +1108,18 @@ static void glock_work_func(struct work_struct *work)
if (delay) {
/* Keep one glock reference for the work we requeue. */
drop_refs--;
- if (gl->gl_name.ln_type != LM_TYPE_INODE)
- delay = 0;
gfs2_glock_queue_work(gl, delay);
}
- /*
- * Drop the remaining glock references manually here. (Mind that
- * gfs2_glock_queue_work depends on the lockref spinlock begin held
- * here as well.)
- */
+ /* Drop the remaining glock references manually. */
+ GLOCK_BUG_ON(gl, gl->gl_lockref.count < drop_refs);
gl->gl_lockref.count -= drop_refs;
if (!gl->gl_lockref.count) {
- __gfs2_glock_put(gl);
- return;
+ if (gl->gl_state == LM_ST_UNLOCKED) {
+ __gfs2_glock_put(gl);
+ return;
+ }
+ gfs2_glock_add_to_lru(gl);
}
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1183,6 +1155,8 @@ again:
out:
rcu_read_unlock();
finish_wait(wq, &wait.wait);
+ if (gl)
+ gfs2_glock_remove_from_lru(gl);
return gl;
}
@@ -1209,13 +1183,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
.ln_sbd = sdp };
struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
- int ret = 0;
gl = find_insert_glock(&name, NULL);
- if (gl) {
- *glp = gl;
- return 0;
- }
+ if (gl)
+ goto found;
if (!create)
return -ENOENT;
@@ -1243,7 +1214,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
atomic_inc(&sdp->sd_glock_disposal);
gl->gl_node.next = NULL;
- gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0;
+ gl->gl_flags = BIT(GLF_INITIAL);
+ if (glops->go_instantiate)
+ gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED);
gl->gl_name = name;
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
gl->gl_lockref.count = 1;
@@ -1275,23 +1248,19 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
tmp = find_insert_glock(&name, gl);
- if (!tmp) {
- *glp = gl;
- goto out;
- }
- if (IS_ERR(tmp)) {
- ret = PTR_ERR(tmp);
- goto out_free;
- }
- *glp = tmp;
+ if (tmp) {
+ gfs2_glock_dealloc(&gl->gl_rcu);
+ if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+ wake_up(&sdp->sd_kill_wait);
-out_free:
- gfs2_glock_dealloc(&gl->gl_rcu);
- if (atomic_dec_and_test(&sdp->sd_glock_disposal))
- wake_up(&sdp->sd_kill_wait);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+ gl = tmp;
+ }
-out:
- return ret;
+found:
+ *glp = gl;
+ return 0;
}
/**
@@ -1461,7 +1430,7 @@ out:
}
/**
- * handle_callback - process a demote request
+ * request_demote - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
* @delay: zero to demote immediately; otherwise pending demote
@@ -1471,8 +1440,8 @@ out:
* practise: LM_ST_SHARED and LM_ST_UNLOCKED
*/
-static void handle_callback(struct gfs2_glock *gl, unsigned int state,
- unsigned long delay, bool remote)
+static void request_demote(struct gfs2_glock *gl, unsigned int state,
+ unsigned long delay, bool remote)
{
if (delay)
set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags);
@@ -1636,15 +1605,12 @@ unlock:
return error;
}
- if (test_bit(GLF_LRU, &gl->gl_flags))
- gfs2_glock_remove_from_lru(gl);
-
gh->gh_error = 0;
spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
- test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
- set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags))) {
+ set_bit(GLF_HAVE_REPLY, &gl->gl_flags);
gl->gl_lockref.count++;
gfs2_glock_queue_work(gl, 0);
}
@@ -1688,7 +1654,7 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh)
* below.
*/
if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ request_demote(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
clear_bit(HIF_HOLDER, &gh->gh_iflags);
@@ -1703,9 +1669,6 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh)
fast_path = 1;
}
- if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
- gfs2_glock_add_to_lru(gl);
-
if (unlikely(!fast_path)) {
gl->gl_lockref.count++;
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1932,10 +1895,10 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
gl->gl_name.ln_type == LM_TYPE_INODE) {
if (time_before(now, holdtime))
delay = holdtime - now;
- if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags))
delay = gl->gl_hold_time;
}
- handle_callback(gl, state, delay, true);
+ request_demote(gl, state, delay, true);
gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1988,14 +1951,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
- set_bit(GLF_FROZEN, &gl->gl_flags);
+ set_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
- set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ set_bit(GLF_HAVE_REPLY, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -2018,10 +1981,12 @@ static int glock_cmp(void *priv, const struct list_head *a,
static bool can_free_glock(struct gfs2_glock *gl)
{
- bool held = gl->gl_state != LM_ST_UNLOCKED;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
return !test_bit(GLF_LOCK, &gl->gl_flags) &&
- gl->gl_lockref.count == held;
+ !gl->gl_lockref.count &&
+ (!test_bit(GLF_LFLUSH, &gl->gl_flags) ||
+ test_bit(SDF_KILL, &sdp->sd_flags));
}
/**
@@ -2063,8 +2028,8 @@ add_back_to_lru:
clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
gl->gl_lockref.count++;
- if (demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ request_demote(gl, LM_ST_UNLOCKED, 0, false);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
@@ -2182,13 +2147,14 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
static void thaw_glock(struct gfs2_glock *gl)
{
- if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ if (!test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags))
return;
if (!lockref_get_not_dead(&gl->gl_lockref))
return;
+ gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
- set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ set_bit(GLF_HAVE_REPLY, &gl->gl_flags);
gfs2_glock_queue_work(gl, 0);
spin_unlock(&gl->gl_lockref.lock);
}
@@ -2207,7 +2173,7 @@ static void clear_glock(struct gfs2_glock *gl)
if (!__lockref_is_dead(&gl->gl_lockref)) {
gl->gl_lockref.count++;
if (gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+ request_demote(gl, LM_ST_UNLOCKED, 0, false);
gfs2_glock_queue_work(gl, 0);
}
spin_unlock(&gl->gl_lockref.lock);
@@ -2259,16 +2225,30 @@ void gfs2_gl_dq_holders(struct gfs2_sbd *sdp)
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
+ unsigned long start = jiffies;
+ bool timed_out = false;
+
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
- flush_workqueue(glock_workqueue);
+ flush_workqueue(sdp->sd_glock_wq);
glock_hash_walk(clear_glock, sdp);
- flush_workqueue(glock_workqueue);
- wait_event_timeout(sdp->sd_kill_wait,
- atomic_read(&sdp->sd_glock_disposal) == 0,
- HZ * 600);
+ flush_workqueue(sdp->sd_glock_wq);
+
+ while (!timed_out) {
+ wait_event_timeout(sdp->sd_kill_wait,
+ !atomic_read(&sdp->sd_glock_disposal),
+ HZ * 60);
+ if (!atomic_read(&sdp->sd_glock_disposal))
+ break;
+ timed_out = time_after(jiffies, start + (HZ * 600));
+ fs_warn(sdp, "%u glocks left after %u seconds%s\n",
+ atomic_read(&sdp->sd_glock_disposal),
+ jiffies_to_msecs(jiffies - start) / 1000,
+ timed_out ? ":" : "; still waiting");
+ }
gfs2_lm_unmount(sdp);
gfs2_free_dead_glocks(sdp);
glock_hash_walk(dump_glock_func, sdp);
+ destroy_workqueue(sdp->sd_glock_wq);
}
static const char *state2str(unsigned state)
@@ -2366,11 +2346,11 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'f';
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
*p++ = 'i';
- if (test_bit(GLF_REPLY_PENDING, gflags))
+ if (test_bit(GLF_HAVE_REPLY, gflags))
*p++ = 'r';
if (test_bit(GLF_INITIAL, gflags))
- *p++ = 'I';
- if (test_bit(GLF_FROZEN, gflags))
+ *p++ = 'a';
+ if (test_bit(GLF_HAVE_FROZEN_REPLY, gflags))
*p++ = 'F';
if (!list_empty(&gl->gl_holders))
*p++ = 'q';
@@ -2380,7 +2360,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'o';
if (test_bit(GLF_BLOCKING, gflags))
*p++ = 'b';
- if (test_bit(GLF_FREEING, gflags))
+ if (test_bit(GLF_UNLOCKED, gflags))
*p++ = 'x';
if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
*p++ = 'n';
@@ -2533,16 +2513,8 @@ int __init gfs2_glock_init(void)
if (ret < 0)
return ret;
- glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_FREEZABLE, 0);
- if (!glock_workqueue) {
- rhashtable_destroy(&gl_hash_table);
- return -ENOMEM;
- }
-
glock_shrinker = shrinker_alloc(0, "gfs2-glock");
if (!glock_shrinker) {
- destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
@@ -2562,7 +2534,6 @@ void gfs2_glock_exit(void)
{
shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
- destroy_workqueue(glock_workqueue);
}
static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 19aef6d53267..adf0091cc98f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -250,7 +250,6 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
void gfs2_glock_free(struct gfs2_glock *gl);
void gfs2_glock_free_later(struct gfs2_glock *gl);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 68677fb69a73..95d8081681dc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -385,23 +385,6 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
gfs2_clear_glop_pending(ip);
}
-/**
- * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int inode_go_demote_ok(const struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-
- if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
- return 0;
-
- return 1;
-}
-
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -648,21 +631,21 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
}
/**
- * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it
- * @gl: glock being freed
+ * inode_go_unlocked - wake up anyone waiting for dlm's unlock ast
+ * @gl: glock being unlocked
*
* For now, this is only used for the journal inode glock. In withdraw
- * situations, we need to wait for the glock to be freed so that we know
+ * situations, we need to wait for the glock to be unlocked so that we know
* other nodes may proceed with recovery / journal replay.
*/
-static void inode_go_free(struct gfs2_glock *gl)
+static void inode_go_unlocked(struct gfs2_glock *gl)
{
/* Note that we cannot reference gl_object because it's already set
* to NULL by this point in its lifecycle. */
- if (!test_bit(GLF_FREEING, &gl->gl_flags))
+ if (!test_bit(GLF_UNLOCKED, &gl->gl_flags))
return;
- clear_bit_unlock(GLF_FREEING, &gl->gl_flags);
- wake_up_bit(&gl->gl_flags, GLF_FREEING);
+ clear_bit_unlock(GLF_UNLOCKED, &gl->gl_flags);
+ wake_up_bit(&gl->gl_flags, GLF_UNLOCKED);
}
/**
@@ -722,13 +705,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
const struct gfs2_glock_operations gfs2_inode_glops = {
.go_sync = inode_go_sync,
.go_inval = inode_go_inval,
- .go_demote_ok = inode_go_demote_ok,
.go_instantiate = inode_go_instantiate,
.go_held = inode_go_held,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB,
- .go_free = inode_go_free,
+ .go_flags = GLOF_ASPACE | GLOF_LVB,
+ .go_unlocked = inode_go_unlocked,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -751,13 +733,13 @@ const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
.go_dump = inode_go_dump,
- .go_flags = GLOF_LRU | GLOF_NONDISK,
+ .go_flags = GLOF_NONDISK,
.go_subclass = 1,
};
const struct gfs2_glock_operations gfs2_flock_glops = {
.go_type = LM_TYPE_FLOCK,
- .go_flags = GLOF_LRU | GLOF_NONDISK,
+ .go_flags = GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_nondisk_glops = {
@@ -768,7 +750,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
const struct gfs2_glock_operations gfs2_quota_glops = {
.go_type = LM_TYPE_QUOTA,
- .go_flags = GLOF_LVB | GLOF_LRU | GLOF_NONDISK,
+ .go_flags = GLOF_LVB | GLOF_NONDISK,
};
const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 60abd7050c99..aa4ef67a34e0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -218,19 +218,17 @@ struct gfs2_glock_operations {
int (*go_sync) (struct gfs2_glock *gl);
int (*go_xmote_bh)(struct gfs2_glock *gl);
void (*go_inval) (struct gfs2_glock *gl, int flags);
- int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_instantiate) (struct gfs2_glock *gl);
int (*go_held)(struct gfs2_holder *gh);
void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl,
const char *fs_id_buf);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
- void (*go_free)(struct gfs2_glock *gl);
+ void (*go_unlocked)(struct gfs2_glock *gl);
const int go_subclass;
const int go_type;
const unsigned long go_flags;
#define GLOF_ASPACE 1 /* address space attached */
#define GLOF_LVB 2 /* Lock Value Block attached */
-#define GLOF_LRU 4 /* LRU managed */
#define GLOF_NONDISK 8 /* not I/O related */
};
@@ -322,14 +320,14 @@ enum {
GLF_DIRTY = 6,
GLF_LFLUSH = 7,
GLF_INVALIDATE_IN_PROGRESS = 8,
- GLF_REPLY_PENDING = 9,
+ GLF_HAVE_REPLY = 9,
GLF_INITIAL = 10,
- GLF_FROZEN = 11,
+ GLF_HAVE_FROZEN_REPLY = 11,
GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
- GLF_FREEING = 16, /* Wait for glock to be freed */
+ GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */
GLF_TRY_TO_EVICT = 17, /* iopen glocks only */
GLF_VERIFY_EVICT = 18, /* iopen glocks only */
};
@@ -772,6 +770,7 @@ struct gfs2_sbd {
/* Workqueue stuff */
+ struct workqueue_struct *sd_glock_wq;
struct workqueue_struct *sd_delete_wq;
/* Daemon stuff */
@@ -783,7 +782,6 @@ struct gfs2_sbd {
struct list_head sd_quota_list;
atomic_t sd_quota_count;
- struct mutex sd_quota_mutex;
struct mutex sd_quota_sync_mutex;
wait_queue_head_t sd_quota_wait;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 49059274a528..fa5134df985f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -134,8 +134,8 @@ static void gdlm_ast(void *arg)
switch (gl->gl_lksb.sb_status) {
case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
- if (gl->gl_ops->go_free)
- gl->gl_ops->go_free(gl);
+ if (gl->gl_ops->go_unlocked)
+ gl->gl_ops->go_unlocked(gl);
gfs2_glock_free(gl);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
@@ -163,11 +163,21 @@ static void gdlm_ast(void *arg)
BUG();
}
- set_bit(GLF_INITIAL, &gl->gl_flags);
+ /*
+ * The GLF_INITIAL flag is initially set for new glocks. Upon the
+ * first successful new (non-conversion) request, we clear this flag to
+ * indicate that a DLM lock exists and that gl->gl_lksb.sb_lkid is the
+ * identifier to use for identifying it.
+ *
+ * Any failed initial requests do not create a DLM lock, so we ignore
+ * the gl->gl_lksb.sb_lkid values that come with such requests.
+ */
+
+ clear_bit(GLF_INITIAL, &gl->gl_flags);
gfs2_glock_complete(gl, ret);
return;
out:
- if (!test_bit(GLF_INITIAL, &gl->gl_flags))
+ if (test_bit(GLF_INITIAL, &gl->gl_flags))
gl->gl_lksb.sb_lkid = 0;
gfs2_glock_complete(gl, ret);
}
@@ -239,7 +249,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
BUG();
}
- if (gl->gl_lksb.sb_lkid != 0) {
+ if (!test_bit(GLF_INITIAL, &gl->gl_flags)) {
lkf |= DLM_LKF_CONVERT;
if (test_bit(GLF_BLOCKING, &gl->gl_flags))
lkf |= DLM_LKF_QUECVT;
@@ -270,14 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
lkf = make_flags(gl, flags, req);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
- if (gl->gl_lksb.sb_lkid) {
- gfs2_update_request_times(gl);
- } else {
+ if (test_bit(GLF_INITIAL, &gl->gl_flags)) {
memset(strname, ' ', GDLM_STRNAME_BYTES - 1);
strname[GDLM_STRNAME_BYTES - 1] = '\0';
gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type);
gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number);
gl->gl_dstamp = ktime_get_real();
+ } else {
+ gfs2_update_request_times(gl);
}
/*
* Submit the actual lock request.
@@ -301,7 +311,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
BUG_ON(!__lockref_is_dead(&gl->gl_lockref));
- if (gl->gl_lksb.sb_lkid == 0) {
+ if (test_bit(GLF_INITIAL, &gl->gl_flags)) {
gfs2_glock_free(gl);
return;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 05975ec76d35..ff1f3e3dc65c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -103,7 +103,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
init_completion(&sdp->sd_journal_ready);
INIT_LIST_HEAD(&sdp->sd_quota_list);
- mutex_init(&sdp->sd_quota_mutex);
mutex_init(&sdp->sd_quota_sync_mutex);
init_waitqueue_head(&sdp->sd_quota_wait);
spin_lock_init(&sdp->sd_bitmap_lock);
@@ -1188,11 +1187,17 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
+ error = -ENOMEM;
+ sdp->sd_glock_wq = alloc_workqueue("gfs2-glock/%s",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0,
+ sdp->sd_fsname);
+ if (!sdp->sd_glock_wq)
+ goto fail_free;
+
sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
- error = -ENOMEM;
if (!sdp->sd_delete_wq)
- goto fail_free;
+ goto fail_glock_wq;
error = gfs2_sys_fs_add(sdp);
if (error)
@@ -1301,6 +1306,8 @@ fail_debug:
gfs2_sys_fs_del(sdp);
fail_delete_wq:
destroy_workqueue(sdp->sd_delete_wq);
+fail_glock_wq:
+ destroy_workqueue(sdp->sd_glock_wq);
fail_free:
free_sbd(sdp);
sb->s_fs_info = NULL;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index aa9cf0102848..2e6bc77f4f81 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -75,9 +75,6 @@
#define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT)
#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
-#define QC_CHANGE 0
-#define QC_SYNC 1
-
/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
/* -> sd_bitmap_lock */
static DEFINE_SPINLOCK(qd_lock);
@@ -319,11 +316,11 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
}
-static void qd_hold(struct gfs2_quota_data *qd)
+static void __qd_hold(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
- gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
- lockref_get(&qd->qd_lockref);
+ gfs2_assert(sdp, qd->qd_lockref.count > 0);
+ qd->qd_lockref.count++;
}
static void qd_put(struct gfs2_quota_data *qd)
@@ -400,16 +397,17 @@ static int bh_get(struct gfs2_quota_data *qd)
struct inode *inode = sdp->sd_qc_inode;
struct gfs2_inode *ip = GFS2_I(inode);
unsigned int block, offset;
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
struct iomap iomap = { };
int error;
- mutex_lock(&sdp->sd_quota_mutex);
-
- if (qd->qd_bh_count++) {
- mutex_unlock(&sdp->sd_quota_mutex);
+ spin_lock(&qd->qd_lockref.lock);
+ if (qd->qd_bh_count) {
+ qd->qd_bh_count++;
+ spin_unlock(&qd->qd_lockref.lock);
return 0;
}
+ spin_unlock(&qd->qd_lockref.lock);
block = qd->qd_slot / sdp->sd_qc_per_block;
offset = qd->qd_slot % sdp->sd_qc_per_block;
@@ -418,122 +416,83 @@ static int bh_get(struct gfs2_quota_data *qd)
(loff_t)block << inode->i_blkbits,
i_blocksize(inode), &iomap);
if (error)
- goto fail;
+ return error;
error = -ENOENT;
if (iomap.type != IOMAP_MAPPED)
- goto fail;
+ return error;
error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits,
DIO_WAIT, 0, &bh);
if (error)
- goto fail;
+ return error;
error = -EIO;
if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
- goto fail_brelse;
-
- qd->qd_bh = bh;
- qd->qd_bh_qc = (struct gfs2_quota_change *)
- (bh->b_data + sizeof(struct gfs2_meta_header) +
- offset * sizeof(struct gfs2_quota_change));
-
- mutex_unlock(&sdp->sd_quota_mutex);
+ goto out;
- return 0;
+ spin_lock(&qd->qd_lockref.lock);
+ if (qd->qd_bh == NULL) {
+ qd->qd_bh = bh;
+ qd->qd_bh_qc = (struct gfs2_quota_change *)
+ (bh->b_data + sizeof(struct gfs2_meta_header) +
+ offset * sizeof(struct gfs2_quota_change));
+ bh = NULL;
+ }
+ qd->qd_bh_count++;
+ spin_unlock(&qd->qd_lockref.lock);
+ error = 0;
-fail_brelse:
+out:
brelse(bh);
-fail:
- qd->qd_bh_count--;
- mutex_unlock(&sdp->sd_quota_mutex);
return error;
}
static void bh_put(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
+ struct buffer_head *bh = NULL;
- mutex_lock(&sdp->sd_quota_mutex);
+ spin_lock(&qd->qd_lockref.lock);
gfs2_assert(sdp, qd->qd_bh_count);
if (!--qd->qd_bh_count) {
- brelse(qd->qd_bh);
+ bh = qd->qd_bh;
qd->qd_bh = NULL;
qd->qd_bh_qc = NULL;
}
- mutex_unlock(&sdp->sd_quota_mutex);
+ spin_unlock(&qd->qd_lockref.lock);
+ brelse(bh);
}
-static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
- u64 *sync_gen)
+static bool qd_grab_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
+ u64 sync_gen)
{
+ bool ret = false;
+
+ spin_lock(&qd->qd_lockref.lock);
if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
!test_bit(QDF_CHANGE, &qd->qd_flags) ||
- (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
- return 0;
-
- /*
- * If qd_change is 0 it means a pending quota change was negated.
- * We should not sync it, but we still have a qd reference and slot
- * reference taken by gfs2_quota_change -> do_qc that need to be put.
- */
- if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) {
- slot_put(qd);
- qd_put(qd);
- return 0;
- }
+ qd->qd_sync_gen >= sync_gen)
+ goto out;
- if (!lockref_get_not_dead(&qd->qd_lockref))
- return 0;
+ if (__lockref_is_dead(&qd->qd_lockref))
+ goto out;
+ qd->qd_lockref.count++;
list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
set_bit(QDF_LOCKED, &qd->qd_flags);
qd->qd_change_sync = qd->qd_change;
slot_hold(qd);
- return 1;
+ ret = true;
+
+out:
+ spin_unlock(&qd->qd_lockref.lock);
+ return ret;
}
-static int qd_bh_get_or_undo(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+static void qd_ungrab_sync(struct gfs2_quota_data *qd)
{
- int error;
-
- error = bh_get(qd);
- if (!error)
- return 0;
-
clear_bit(QDF_LOCKED, &qd->qd_flags);
slot_put(qd);
qd_put(qd);
- return error;
-}
-
-static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
-{
- struct gfs2_quota_data *qd = NULL, *iter;
- int error;
-
- *qdp = NULL;
-
- if (sb_rdonly(sdp->sd_vfs))
- return 0;
-
- spin_lock(&qd_lock);
-
- list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) {
- if (qd_check_sync(sdp, iter, &sdp->sd_quota_sync_gen)) {
- qd = iter;
- break;
- }
- }
-
- spin_unlock(&qd_lock);
-
- if (qd) {
- error = qd_bh_get_or_undo(sdp, qd);
- if (error)
- return error;
- *qdp = qd;
- }
-
- return 0;
}
static void qdsb_put(struct gfs2_quota_data *qd)
@@ -545,8 +504,10 @@ static void qdsb_put(struct gfs2_quota_data *qd)
static void qd_unlock(struct gfs2_quota_data *qd)
{
+ spin_lock(&qd->qd_lockref.lock);
gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags));
clear_bit(QDF_LOCKED, &qd->qd_flags);
+ spin_unlock(&qd->qd_lockref.lock);
qdsb_put(qd);
}
@@ -710,48 +671,57 @@ static int sort_qd(const void *a, const void *b)
return 0;
}
-static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type)
+static void do_qc(struct gfs2_quota_data *qd, s64 change)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
struct gfs2_quota_change *qc = qd->qd_bh_qc;
+ bool needs_put = false;
s64 x;
- mutex_lock(&sdp->sd_quota_mutex);
gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);
- if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
- qc->qc_change = 0;
+ /*
+ * The QDF_CHANGE flag indicates that the slot in the quota change file
+ * is used. Here, we use the value of qc->qc_change when the slot is
+ * used, and we assume a value of 0 otherwise.
+ */
+
+ spin_lock(&qd->qd_lockref.lock);
+
+ x = 0;
+ if (test_bit(QDF_CHANGE, &qd->qd_flags))
+ x = be64_to_cpu(qc->qc_change);
+ x += change;
+ qd->qd_change += change;
+
+ if (!x && test_bit(QDF_CHANGE, &qd->qd_flags)) {
+ /* The slot in the quota change file becomes unused. */
+ clear_bit(QDF_CHANGE, &qd->qd_flags);
+ qc->qc_flags = 0;
+ qc->qc_id = 0;
+ needs_put = true;
+ } else if (x && !test_bit(QDF_CHANGE, &qd->qd_flags)) {
+ /* The slot in the quota change file becomes used. */
+ set_bit(QDF_CHANGE, &qd->qd_flags);
+ __qd_hold(qd);
+ slot_hold(qd);
+
qc->qc_flags = 0;
if (qd->qd_id.type == USRQUOTA)
qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));
}
-
- x = be64_to_cpu(qc->qc_change) + change;
qc->qc_change = cpu_to_be64(x);
- spin_lock(&qd_lock);
- qd->qd_change = x;
- spin_unlock(&qd_lock);
+ spin_unlock(&qd->qd_lockref.lock);
- if (qc_type == QC_CHANGE) {
- if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
- qd_hold(qd);
- slot_hold(qd);
- }
- } else {
- gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
- clear_bit(QDF_CHANGE, &qd->qd_flags);
- qc->qc_flags = 0;
- qc->qc_id = 0;
+ if (needs_put) {
slot_put(qd);
qd_put(qd);
}
-
if (change < 0) /* Reset quiet flag if we freed some blocks */
clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
- mutex_unlock(&sdp->sd_quota_mutex);
}
static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
@@ -890,6 +860,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
be64_add_cpu(&q.qu_value, change);
if (((s64)be64_to_cpu(q.qu_value)) < 0)
q.qu_value = 0; /* Never go negative on quota usage */
+ spin_lock(&qd->qd_lockref.lock);
qd->qd_qb.qb_value = q.qu_value;
if (fdq) {
if (fdq->d_fieldmask & QC_SPC_SOFT) {
@@ -905,6 +876,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
qd->qd_qb.qb_value = q.qu_value;
}
}
+ spin_unlock(&qd->qd_lockref.lock);
err = gfs2_write_disk_quota(sdp, &q, loc);
if (!err) {
@@ -919,7 +891,8 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
return err;
}
-static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
+static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda,
+ u64 sync_gen)
{
struct gfs2_sbd *sdp = (*qda)->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -992,7 +965,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
if (error)
goto out_end_trans;
- do_qc(qd, -qd->qd_change_sync, QC_SYNC);
+ do_qc(qd, -qd->qd_change_sync);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
@@ -1010,8 +983,13 @@ out_dq:
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC);
if (!error) {
- for (x = 0; x < num_qd; x++)
- qda[x]->qd_sync_gen = sdp->sd_quota_sync_gen;
+ for (x = 0; x < num_qd; x++) {
+ qd = qda[x];
+ spin_lock(&qd->qd_lockref.lock);
+ if (qd->qd_sync_gen < sync_gen)
+ qd->qd_sync_gen = sync_gen;
+ spin_unlock(&qd->qd_lockref.lock);
+ }
}
return error;
}
@@ -1036,7 +1014,9 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
qlvb->qb_limit = q.qu_limit;
qlvb->qb_warn = q.qu_warn;
qlvb->qb_value = q.qu_value;
+ spin_lock(&qd->qd_lockref.lock);
qd->qd_qb = *qlvb;
+ spin_unlock(&qd->qd_lockref.lock);
return 0;
}
@@ -1058,7 +1038,9 @@ restart:
if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
force_refresh = FORCE;
+ spin_lock(&qd->qd_lockref.lock);
qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
+ spin_unlock(&qd->qd_lockref.lock);
if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
gfs2_glock_dq_uninit(q_gh);
@@ -1129,35 +1111,36 @@ static bool need_sync(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_sbd;
struct gfs2_tune *gt = &sdp->sd_tune;
- s64 value;
+ s64 value, change, limit;
unsigned int num, den;
+ int ret = false;
+ spin_lock(&qd->qd_lockref.lock);
if (!qd->qd_qb.qb_limit)
- return false;
+ goto out;
- spin_lock(&qd_lock);
- value = qd->qd_change;
- spin_unlock(&qd_lock);
+ change = qd->qd_change;
+ if (change <= 0)
+ goto out;
+ value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+ limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
+ if (value >= limit)
+ goto out;
spin_lock(&gt->gt_spin);
num = gt->gt_quota_scale_num;
den = gt->gt_quota_scale_den;
spin_unlock(&gt->gt_spin);
- if (value <= 0)
- return false;
- else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
- (s64)be64_to_cpu(qd->qd_qb.qb_limit))
- return false;
- else {
- value *= gfs2_jindex_size(sdp) * num;
- value = div_s64(value, den);
- value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
- if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
- return false;
- }
+ change *= gfs2_jindex_size(sdp) * num;
+ change = div_s64(change, den);
+ if (value + change < limit)
+ goto out;
- return true;
+ ret = true;
+out:
+ spin_unlock(&qd->qd_lockref.lock);
+ return ret;
}
void gfs2_quota_unlock(struct gfs2_inode *ip)
@@ -1166,7 +1149,6 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS];
unsigned int count = 0;
u32 x;
- int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
return;
@@ -1174,6 +1156,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
bool sync;
+ int error;
qd = ip->i_qadata->qa_qd[x];
sync = need_sync(qd);
@@ -1183,18 +1166,26 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
continue;
spin_lock(&qd_lock);
- found = qd_check_sync(sdp, qd, NULL);
+ sync = qd_grab_sync(sdp, qd, U64_MAX);
spin_unlock(&qd_lock);
- if (!found)
+ if (!sync)
continue;
- if (!qd_bh_get_or_undo(sdp, qd))
- qda[count++] = qd;
+ gfs2_assert_warn(sdp, qd->qd_change_sync);
+ error = bh_get(qd);
+ if (error) {
+ qd_ungrab_sync(qd);
+ continue;
+ }
+
+ qda[count++] = qd;
}
if (count) {
- do_sync(count, qda);
+ u64 sync_gen = READ_ONCE(sdp->sd_quota_sync_gen);
+
+ do_sync(count, qda, sync_gen);
for (x = 0; x < count; x++)
qd_unlock(qda[x]);
}
@@ -1253,12 +1244,12 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
qid_eq(qd->qd_id, make_kqid_gid(gid))))
continue;
+ spin_lock(&qd->qd_lockref.lock);
warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn);
limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
- spin_lock(&qd_lock);
value += qd->qd_change;
- spin_unlock(&qd_lock);
+ spin_unlock(&qd->qd_lockref.lock);
if (limit > 0 && (limit - value) < ap->allowed)
ap->allowed = limit - value;
@@ -1312,39 +1303,20 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))) {
- do_qc(qd, change, QC_CHANGE);
+ do_qc(qd, change);
}
}
}
-static bool qd_changed(struct gfs2_sbd *sdp)
-{
- struct gfs2_quota_data *qd;
- bool changed = false;
-
- spin_lock(&qd_lock);
- list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
- !test_bit(QDF_CHANGE, &qd->qd_flags))
- continue;
-
- changed = true;
- break;
- }
- spin_unlock(&qd_lock);
- return changed;
-}
-
int gfs2_quota_sync(struct super_block *sb, int type)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder);
- unsigned int num_qd;
- unsigned int x;
+ u64 sync_gen;
int error = 0;
- if (!qd_changed(sdp))
+ if (sb_rdonly(sdp->sd_vfs))
return 0;
qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
@@ -1352,27 +1324,44 @@ int gfs2_quota_sync(struct super_block *sb, int type)
return -ENOMEM;
mutex_lock(&sdp->sd_quota_sync_mutex);
- sdp->sd_quota_sync_gen++;
+ sync_gen = sdp->sd_quota_sync_gen + 1;
do {
- num_qd = 0;
+ struct gfs2_quota_data *iter;
+ unsigned int num_qd = 0;
+ unsigned int x;
- for (;;) {
- error = qd_fish(sdp, qda + num_qd);
- if (error || !qda[num_qd])
- break;
- if (++num_qd == max_qd)
- break;
+ spin_lock(&qd_lock);
+ list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) {
+ if (qd_grab_sync(sdp, iter, sync_gen)) {
+ qda[num_qd++] = iter;
+ if (num_qd == max_qd)
+ break;
+ }
}
+ spin_unlock(&qd_lock);
- if (num_qd) {
+ if (!num_qd)
+ break;
+
+ for (x = 0; x < num_qd; x++) {
+ error = bh_get(qda[x]);
if (!error)
- error = do_sync(num_qd, qda);
+ continue;
+
+ while (x < num_qd)
+ qd_ungrab_sync(qda[--num_qd]);
+ break;
+ }
- for (x = 0; x < num_qd; x++)
- qd_unlock(qda[x]);
+ if (!error) {
+ WRITE_ONCE(sdp->sd_quota_sync_gen, sync_gen);
+ error = do_sync(num_qd, qda, sync_gen);
}
- } while (!error && num_qd == max_qd);
+
+ for (x = 0; x < num_qd; x++)
+ qd_unlock(qda[x]);
+ } while (!error);
mutex_unlock(&sdp->sd_quota_sync_mutex);
kfree(qda);
@@ -1407,6 +1396,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
unsigned int found = 0;
unsigned int hash;
unsigned int bm_size;
+ struct buffer_head *bh;
u64 dblock;
u32 extlen = 0;
int error;
@@ -1426,8 +1416,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
return error;
for (x = 0; x < blocks; x++) {
- struct buffer_head *bh;
- const struct gfs2_quota_change *qc;
+ struct gfs2_quota_change *qc;
unsigned int y;
if (!extlen) {
@@ -1440,15 +1429,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
if (!bh)
goto fail;
- if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
- brelse(bh);
- goto fail;
- }
+ if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
+ goto fail_brelse;
- qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
+ qc = (struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
y++, slot++) {
- struct gfs2_quota_data *qd;
+ struct gfs2_quota_data *old_qd, *qd;
s64 qc_change = be64_to_cpu(qc->qc_change);
u32 qc_flags = be32_to_cpu(qc->qc_flags);
enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
@@ -1461,10 +1448,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
hash = gfs2_qd_hash(sdp, qc_id);
qd = qd_alloc(hash, sdp, qc_id);
- if (qd == NULL) {
- brelse(bh);
- goto fail;
- }
+ if (qd == NULL)
+ goto fail_brelse;
set_bit(QDF_CHANGE, &qd->qd_flags);
qd->qd_change = qc_change;
@@ -1472,18 +1457,41 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
qd->qd_slot_ref = 1;
spin_lock(&qd_lock);
+ spin_lock_bucket(hash);
+ old_qd = gfs2_qd_search_bucket(hash, sdp, qc_id);
+ if (old_qd) {
+ fs_err(sdp, "Corruption found in quota_change%u"
+ "file: duplicate identifier in "
+ "slot %u\n",
+ sdp->sd_jdesc->jd_jid, slot);
+
+ spin_unlock_bucket(hash);
+ spin_unlock(&qd_lock);
+ qd_put(old_qd);
+
+ gfs2_glock_put(qd->qd_gl);
+ kmem_cache_free(gfs2_quotad_cachep, qd);
+
+ /* zero out the duplicate slot */
+ lock_buffer(bh);
+ memset(qc, 0, sizeof(*qc));
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ continue;
+ }
BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
list_add(&qd->qd_list, &sdp->sd_quota_list);
atomic_inc(&sdp->sd_quota_count);
- spin_unlock(&qd_lock);
-
- spin_lock_bucket(hash);
hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
spin_unlock_bucket(hash);
+ spin_unlock(&qd_lock);
found++;
}
+ if (buffer_dirty(bh))
+ sync_dirty_buffer(bh);
brelse(bh);
dblock++;
extlen--;
@@ -1494,6 +1502,10 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
return 0;
+fail_brelse:
+ if (buffer_dirty(bh))
+ sync_dirty_buffer(bh);
+ brelse(bh);
fail:
gfs2_quota_cleanup(sdp);
return error;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7a5aedfcd52a..6678060ed4d2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1524,7 +1524,6 @@ out:
if (ip->i_gl) {
glock_clear_object(ip->i_gl, ip);
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
- gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put_eventually(ip->i_gl);
rcu_assign_pointer(ip->i_gl, NULL);
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index a5deb9f86831..8eae8d62a413 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -53,9 +53,9 @@
{(1UL << GLF_DIRTY), "y" }, \
{(1UL << GLF_LFLUSH), "f" }, \
{(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
- {(1UL << GLF_REPLY_PENDING), "r" }, \
- {(1UL << GLF_INITIAL), "I" }, \
- {(1UL << GLF_FROZEN), "F" }, \
+ {(1UL << GLF_HAVE_REPLY), "r" }, \
+ {(1UL << GLF_INITIAL), "a" }, \
+ {(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \
{(1UL << GLF_LRU), "L" }, \
{(1UL << GLF_OBJECT), "o" }, \
{(1UL << GLF_BLOCKING), "b" })
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index af4758d8d894..13be8d1d228b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -99,12 +99,12 @@ out_unlock:
*/
int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp)
{
+ int flags = LM_FLAG_NOEXP | GL_EXACT;
int error;
- error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT,
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, flags,
&sdp->sd_freeze_gh);
- if (error)
+ if (error && error != GLR_TRYFAILED)
fs_err(sdp, "can't lock the freeze glock: %d\n", error);
return error;
}
@@ -206,9 +206,9 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
* on other nodes to be successful, otherwise we remain the owner of
* the glock as far as dlm is concerned.
*/
- if (i_gl->gl_ops->go_free) {
- set_bit(GLF_FREEING, &i_gl->gl_flags);
- wait_on_bit(&i_gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE);
+ if (i_gl->gl_ops->go_unlocked) {
+ set_bit(GLF_UNLOCKED, &i_gl->gl_flags);
+ wait_on_bit(&i_gl->gl_flags, GLF_UNLOCKED, TASK_UNINTERRUPTIBLE);
}
/*
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 8c34798a0715..744e10b46904 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,6 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
+ HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;
if (S_ISDIR(mode)) {
inode->i_size = 2;
HFS_SB(sb)->folder_count++;
@@ -275,6 +276,8 @@ void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
for (count = 0, i = 0; i < 3; i++)
count += be16_to_cpu(ext[i].count);
HFS_I(inode)->first_blocks = count;
+ HFS_I(inode)->cached_start = 0;
+ HFS_I(inode)->cached_blocks = 0;
inode->i_size = HFS_I(inode)->phys_size = log_size;
HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 6764afa98a6f..eeac99765f0d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -28,6 +28,7 @@
static struct kmem_cache *hfs_inode_cachep;
+MODULE_DESCRIPTION("Apple Macintosh file system support");
MODULE_LICENSE("GPL");
static int hfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index ca2ba8c9f82e..901e83d65d20 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -25,19 +25,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->key = ptr + tree->max_key_len + 2;
hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
tree->cnid, __builtin_return_address(0));
- switch (tree->cnid) {
- case HFSPLUS_CAT_CNID:
- mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
- break;
- case HFSPLUS_EXT_CNID:
- mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
- break;
- case HFSPLUS_ATTR_CNID:
- mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
- break;
- default:
- BUG();
- }
+ mutex_lock_nested(&tree->tree_lock,
+ hfsplus_btree_lock_class(tree));
return 0;
}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 3c572e44f2ad..9c51867dddc5 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -430,7 +430,8 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
hfsplus_free_extents(sb, ext_entry, total_blocks - start,
total_blocks);
total_blocks = start;
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
} while (total_blocks > blocks);
hfs_find_exit(&fd);
@@ -592,7 +593,8 @@ void hfsplus_file_truncate(struct inode *inode)
alloc_cnt, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->first_extents);
hip->first_blocks = blk_cnt;
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
break;
}
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
@@ -606,7 +608,8 @@ void hfsplus_file_truncate(struct inode *inode)
hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
hfsplus_dump_extent(hip->cached_extents);
- mutex_lock(&fd.tree->tree_lock);
+ mutex_lock_nested(&fd.tree->tree_lock,
+ hfsplus_btree_lock_class(fd.tree));
if (blk_cnt > start) {
hip->extent_state |= HFSPLUS_EXT_DIRTY;
break;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 012a3d003fbe..9e78f181c24f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -553,6 +553,27 @@ static inline __be32 __hfsp_ut2mt(time64_t ut)
return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET);
}
+static inline enum hfsplus_btree_mutex_classes
+hfsplus_btree_lock_class(struct hfs_btree *tree)
+{
+ enum hfsplus_btree_mutex_classes class;
+
+ switch (tree->cnid) {
+ case HFSPLUS_CAT_CNID:
+ class = CATALOG_BTREE_MUTEX;
+ break;
+ case HFSPLUS_EXT_CNID:
+ class = EXTENTS_BTREE_MUTEX;
+ break;
+ case HFSPLUS_ATTR_CNID:
+ class = ATTR_BTREE_MUTEX;
+ break;
+ default:
+ BUG();
+ }
+ return class;
+}
+
/* compatibility */
#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) }
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 5661a2e24d03..40d04dba13ac 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -40,7 +40,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
/* Directory containing the bootable system */
vh->finder_info[0] = bvh->finder_info[0] =
- cpu_to_be32(parent_ino(dentry));
+ cpu_to_be32(d_parent_ino(dentry));
/*
* Bootloader. Just using the inode here breaks in the case of
@@ -51,7 +51,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
/* Per spec, the OS X system folder - same as finder_info[0] here */
vh->finder_info[5] = bvh->finder_info[5] =
- cpu_to_be32(parent_ino(dentry));
+ cpu_to_be32(d_parent_ino(dentry));
mutex_unlock(&sbi->vh_mutex);
return 0;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 5a400259ae74..9a1a93e3888b 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -696,7 +696,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
return err;
}
- strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
+ strbuf = kzalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
if (!strbuf) {
res = -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index a73d27c4dd58..3eb747d26924 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -16,11 +16,16 @@
#include <linux/seq_file.h>
#include <linux/writeback.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include "hostfs.h"
#include <init.h>
#include <kern.h>
+struct hostfs_fs_info {
+ char *host_root_path;
+};
+
struct hostfs_inode_info {
int fd;
fmode_t mode;
@@ -90,8 +95,10 @@ static char *__dentry_name(struct dentry *dentry, char *name)
char *p = dentry_path_raw(dentry, name, PATH_MAX);
char *root;
size_t len;
+ struct hostfs_fs_info *fsi;
- root = dentry->d_sb->s_fs_info;
+ fsi = dentry->d_sb->s_fs_info;
+ root = fsi->host_root_path;
len = strlen(root);
if (IS_ERR(p)) {
__putname(name);
@@ -196,8 +203,10 @@ static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
long long f_bavail;
long long f_files;
long long f_ffree;
+ struct hostfs_fs_info *fsi;
- err = do_statfs(dentry->d_sb->s_fs_info,
+ fsi = dentry->d_sb->s_fs_info;
+ err = do_statfs(fsi->host_root_path,
&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
&sf->f_namelen);
@@ -245,7 +254,11 @@ static void hostfs_free_inode(struct inode *inode)
static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
{
- const char *root_path = root->d_sb->s_fs_info;
+ struct hostfs_fs_info *fsi;
+ const char *root_path;
+
+ fsi = root->d_sb->s_fs_info;
+ root_path = fsi->host_root_path;
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
@@ -432,31 +445,20 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
static int hostfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
char *buffer;
- loff_t start = page_offset(page);
+ loff_t start = folio_pos(folio);
int bytes_read, ret = 0;
- buffer = kmap_local_page(page);
+ buffer = kmap_local_folio(folio, 0);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
PAGE_SIZE);
- if (bytes_read < 0) {
- ClearPageUptodate(page);
- SetPageError(page);
+ if (bytes_read < 0)
ret = bytes_read;
- goto out;
- }
-
- memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
-
- ClearPageError(page);
- SetPageUptodate(page);
-
- out:
- flush_dcache_page(page);
+ else
+ buffer = folio_zero_tail(folio, bytes_read, buffer);
kunmap_local(buffer);
- unlock_page(page);
+ folio_end_read(folio, ret == 0);
return ret;
}
@@ -922,10 +924,11 @@ static const struct inode_operations hostfs_link_iops = {
.get_link = hostfs_get_link,
};
-static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
+static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct hostfs_fs_info *fsi = sb->s_fs_info;
+ const char *host_root = fc->source;
struct inode *root_inode;
- char *host_root_path, *req_root = d;
int err;
sb->s_blocksize = 1024;
@@ -939,15 +942,15 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
return err;
/* NULL is printed as '(null)' by printf(): avoid that. */
- if (req_root == NULL)
- req_root = "";
+ if (fc->source == NULL)
+ host_root = "";
- sb->s_fs_info = host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
- if (host_root_path == NULL)
+ fsi->host_root_path =
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
+ if (fsi->host_root_path == NULL)
return -ENOMEM;
- root_inode = hostfs_iget(sb, host_root_path);
+ root_inode = hostfs_iget(sb, fsi->host_root_path);
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
@@ -955,7 +958,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
char *name;
iput(root_inode);
- name = follow_link(host_root_path);
+ name = follow_link(fsi->host_root_path);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -972,11 +975,38 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
return 0;
}
-static struct dentry *hostfs_read_sb(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
+static int hostfs_fc_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, hostfs_fill_super);
+}
+
+static void hostfs_fc_free(struct fs_context *fc)
{
- return mount_nodev(type, flags, data, hostfs_fill_sb_common);
+ struct hostfs_fs_info *fsi = fc->s_fs_info;
+
+ if (!fsi)
+ return;
+
+ kfree(fsi->host_root_path);
+ kfree(fsi);
+}
+
+static const struct fs_context_operations hostfs_context_ops = {
+ .get_tree = hostfs_fc_get_tree,
+ .free = hostfs_fc_free,
+};
+
+static int hostfs_init_fs_context(struct fs_context *fc)
+{
+ struct hostfs_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &hostfs_context_ops;
+ return 0;
}
static void hostfs_kill_sb(struct super_block *s)
@@ -986,11 +1016,11 @@ static void hostfs_kill_sb(struct super_block *s)
}
static struct file_system_type hostfs_type = {
- .owner = THIS_MODULE,
- .name = "hostfs",
- .mount = hostfs_read_sb,
- .kill_sb = hostfs_kill_sb,
- .fs_flags = 0,
+ .owner = THIS_MODULE,
+ .name = "hostfs",
+ .init_fs_context = hostfs_init_fs_context,
+ .kill_sb = hostfs_kill_sb,
+ .fs_flags = 0,
};
MODULE_ALIAS_FS("hostfs");
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9184b4584b01..d0edf9ed33b6 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -472,9 +472,8 @@ out:
static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- char *link = page_address(page);
- struct inode *i = page->mapping->host;
+ char *link = folio_address(folio);
+ struct inode *i = folio->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
int err;
@@ -485,17 +484,9 @@ static int hpfs_symlink_read_folio(struct file *file, struct folio *folio)
goto fail;
err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
brelse(bh);
- if (err)
- goto fail;
- hpfs_unlock(i->i_sb);
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
-
fail:
hpfs_unlock(i->i_sb);
- SetPageError(page);
- unlock_page(page);
+ folio_end_read(folio, err == 0);
return err;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 314834a078e9..e73717daa5f9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -793,4 +793,5 @@ static void __exit exit_hpfs_fs(void)
module_init(init_hpfs_fs)
module_exit(exit_hpfs_fs)
+MODULE_DESCRIPTION("OS/2 HPFS file system support");
MODULE_LICENSE("GPL");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 412f295acebe..9f6cff356796 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -73,13 +73,13 @@ enum hugetlb_param {
};
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -222,13 +222,13 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev;
struct hstate *h = hstate_file(file);
const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (len > TASK_SIZE)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED) {
@@ -239,9 +239,10 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
- vma = find_vma(mm, addr);
- if (mmap_end - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
+ vma = find_vma_prev(mm, addr, &prev);
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
+ (!vma || addr + len <= vm_start_gap(vma)) &&
+ (!prev || addr >= vm_end_gap(prev)))
return addr;
}
@@ -422,7 +423,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
if (!ptep)
return false;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(vma->vm_mm, addr, ptep);
if (huge_pte_none(pte) || !pte_present(pte))
return false;
@@ -892,7 +893,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
error = PTR_ERR(folio);
goto out;
}
- clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
+ folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size));
__folio_mark_uptodate(folio);
error = hugetlb_add_to_page_cache(folio, mapping, index);
if (unlikely(error)) {
@@ -1128,10 +1129,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
hugetlb_set_folio_subpool(src, NULL);
}
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(dst, src);
- else
- folio_migrate_flags(dst, src);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
@@ -1376,15 +1374,11 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
switch (opt) {
case Opt_uid:
- ctx->uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(ctx->uid))
- goto bad_val;
+ ctx->uid = result.uid;
return 0;
case Opt_gid:
- ctx->gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(ctx->gid))
- goto bad_val;
+ ctx->gid = result.gid;
return 0;
case Opt_mode:
diff --git a/fs/inode.c b/fs/inode.c
index 3a41f83a4ba5..f356fe2ec2b6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ inode->i_state = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (unlikely(security_inode_alloc(inode)))
return -ENOMEM;
+
this_cpu_inc(nr_inodes);
return 0;
@@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
-static void __wait_on_freeing_inode(struct inode *inode);
+static void __wait_on_freeing_inode(struct inode *inode, bool locked);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data)
+ void *data, bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -924,29 +935,39 @@ repeat:
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
+ struct hlist_head *head, unsigned long ino,
+ bool locked)
{
struct inode *inode = NULL;
+ if (locked)
+ lockdep_assert_held(&inode_hash_lock);
+ else
+ lockdep_assert_not_held(&inode_hash_lock);
+
+ rcu_read_lock();
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
return inode;
}
+ rcu_read_unlock();
return NULL;
}
@@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino);
*/
struct inode *new_inode_pseudo(struct super_block *sb)
{
- struct inode *inode = alloc_inode(sb);
-
- if (inode) {
- spin_lock(&inode->i_lock);
- inode->i_state = 0;
- spin_unlock(&inode->i_lock);
- }
- return inode;
+ return alloc_inode(sb);
}
/**
@@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
again:
spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data);
+ old = find_inode(inode->i_sb, head, test, data, true);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
struct inode *new = alloc_inode(sb);
if (new) {
- new->i_state = 0;
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
@@ -1246,6 +1259,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
EXPORT_SYMBOL(iget5_locked);
/**
+ * iget5_locked_rcu - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is equivalent to iget5_locked, except the @test callback must
+ * tolerate the inode not being stable, including being mid-teardown.
+ */
+struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode, *new;
+
+again:
+ inode = find_inode(sb, head, test, data, false);
+ if (inode) {
+ if (IS_ERR(inode))
+ return NULL;
+ wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ return inode;
+ }
+
+ new = alloc_inode(sb);
+ if (new) {
+ inode = inode_insert5(new, hashval, test, set, data);
+ if (unlikely(inode != new))
+ destroy_inode(new);
+ }
+ return inode;
+}
+EXPORT_SYMBOL_GPL(iget5_locked_rcu);
+
+/**
* iget_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @ino: inode number to get
@@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- spin_unlock(&inode_hash_lock);
+ inode = find_inode_fast(sb, head, ino, false);
if (inode) {
if (IS_ERR(inode))
return NULL;
@@ -1283,7 +1335,7 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino);
+ old = find_inode_fast(sb, head, ino, true);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
@@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
struct inode *inode;
spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data);
+ inode = find_inode(sb, head, test, data, true);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
+ inode = find_inode_fast(sb, head, ino, true);
spin_unlock(&inode_hash_lock);
if (inode) {
@@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
-static void __wait_on_freeing_inode(struct inode *inode)
+static void __wait_on_freeing_inode(struct inode *inode, bool locked)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ rcu_read_unlock();
+ if (locked)
+ spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
- spin_lock(&inode_hash_lock);
+ if (locked)
+ spin_lock(&inode_hash_lock);
+ rcu_read_lock();
}
static __initdata unsigned long ihash_entries;
@@ -2538,6 +2594,7 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
return true;
return false;
}
+EXPORT_SYMBOL(in_group_or_capable);
/**
* mode_strip_sgid - handle the sgid bit for non-directories
diff --git a/fs/internal.h b/fs/internal.h
index ab2225136f60..cdd73209eecb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -17,6 +17,7 @@ struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;
+struct ns_common;
/*
* block/bdev.c
@@ -239,6 +240,7 @@ extern void mnt_pin_kill(struct mount *m);
* fs/nsfs.c
*/
extern const struct dentry_operations ns_dentry_operations;
+int open_namespace(struct ns_common *ns);
/*
* fs/stat.c:
@@ -247,6 +249,8 @@ extern const struct dentry_operations ns_dentry_operations;
int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+ struct statx __user *buffer);
/*
* fs/splice.c:
@@ -321,3 +325,15 @@ struct stashed_operations {
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+/**
+ * path_mounted - check whether path is mounted
+ * @path: path to check
+ *
+ * Determine whether @path refers to the root of a mount.
+ *
+ * Return: true if @path is the root of a mount, false if not.
+ */
+static inline bool path_mounted(const struct path *path)
+{
+ return path->mnt->mnt_root == path->dentry;
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d46558990279..f420c53d86ac 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -307,8 +307,6 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off,
spin_unlock_irqrestore(&ifs->state_lock, flags);
}
- if (error)
- folio_set_error(folio);
if (finished)
folio_end_read(folio, uptodate);
}
@@ -444,6 +442,24 @@ done:
return pos - orig_pos + plen;
}
+static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
+{
+ struct folio *folio = ctx->cur_folio;
+ size_t offset = offset_in_folio(folio, iter->pos);
+ loff_t length = min_t(loff_t, folio_size(folio) - offset,
+ iomap_length(iter));
+ loff_t done, ret;
+
+ for (done = 0; done < length; done += ret) {
+ ret = iomap_readpage_iter(iter, ctx, done);
+ if (ret <= 0)
+ return ret;
+ }
+
+ return done;
+}
+
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
{
struct iomap_iter iter = {
@@ -459,10 +475,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_readpage_iter(&iter, &ctx, 0);
-
- if (ret < 0)
- folio_set_error(folio);
+ iter.processed = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -698,7 +711,6 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (folio_test_uptodate(folio))
return 0;
- folio_clear_error(folio);
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
@@ -878,37 +890,22 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t old_size = iter->inode->i_size;
- size_t written;
if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
- written = copied;
- } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, &folio->page, NULL);
- WARN_ON_ONCE(written != copied && written != 0);
- } else {
- written = __iomap_write_end(iter->inode, pos, len, copied,
- folio) ? copied : 0;
+ return true;
}
- /*
- * Update the in-memory inode size after copying the data into the page
- * cache. It's up to the file system to write the updated size to disk,
- * preferably after I/O completion so that no stale data is exposed.
- * Only once that's done can we unlock and release the folio.
- */
- if (pos + written > old_size) {
- i_size_write(iter->inode, pos + written);
- iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
- }
- __iomap_put_folio(iter, pos, written, folio);
+ if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+ size_t bh_written;
- if (old_size < pos)
- pagecache_isize_extended(iter->inode, old_size, pos);
+ bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
+ len, copied, &folio->page, NULL);
+ WARN_ON_ONCE(bh_written != copied && bh_written != 0);
+ return bh_written == copied;
+ }
- return written == copied;
+ return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -923,6 +920,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
do {
struct folio *folio;
+ loff_t old_size;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
@@ -974,6 +972,23 @@ retry:
written = iomap_write_end(iter, pos, bytes, copied, folio) ?
copied : 0;
+ /*
+ * Update the in-memory inode size after copying the data into
+ * the page cache. It's up to the file system to write the
+ * updated size to disk, preferably after I/O completion so that
+ * no stale data is exposed. Only once that's done can we
+ * unlock and release the folio.
+ */
+ old_size = iter->inode->i_size;
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+ }
+ __iomap_put_folio(iter, pos, written, folio);
+
+ if (old_size < pos)
+ pagecache_isize_extended(iter->inode, old_size, pos);
+
cond_resched();
if (unlikely(written == 0)) {
/*
@@ -1344,6 +1359,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
bytes = folio_size(folio) - offset;
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1409,6 +1425,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_mark_accessed(folio);
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
+ __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1539,8 +1556,6 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
/* walk all folios in bio, ending page IO on them */
bio_for_each_folio_all(fi, bio) {
- if (error)
- folio_set_error(fi.folio);
iomap_finish_folio_write(inode, fi.folio, fi.length);
folio_count++;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 93b1077a380a..f50311a6b429 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -326,8 +326,8 @@ static const struct fs_parameter_spec isofs_param_spec[] = {
fsparam_u32 ("session", Opt_session),
fsparam_u32 ("sbsector", Opt_sb),
fsparam_enum ("check", Opt_check, isofs_param_check),
- fsparam_u32 ("uid", Opt_uid),
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
/* Note: mode/dmode historically accepted %u not strictly %o */
fsparam_u32 ("mode", Opt_mode),
fsparam_u32 ("dmode", Opt_dmode),
@@ -344,8 +344,6 @@ static int isofs_parse_param(struct fs_context *fc,
struct isofs_options *popt = fc->fs_private;
struct fs_parse_result result;
int opt;
- kuid_t uid;
- kgid_t gid;
unsigned int n;
/* There are no remountable options */
@@ -409,17 +407,11 @@ static int isofs_parse_param(struct fs_context *fc,
case Opt_ignore:
break;
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return -EINVAL;
- popt->uid = uid;
+ popt->uid = result.uid;
popt->uid_set = 1;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return -EINVAL;
- popt->gid = gid;
+ popt->gid = result.gid;
popt->gid_set = 1;
break;
case Opt_mode:
@@ -1625,4 +1617,5 @@ static void __exit exit_iso9660_fs(void)
module_init(init_iso9660_fs)
module_exit(exit_iso9660_fs)
+MODULE_DESCRIPTION("ISO 9660 CDROM file system support");
MODULE_LICENSE("GPL");
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index d6c17ad69dee..dbf911126e61 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -688,11 +688,10 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
*/
static int rock_ridge_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = page_address(page);
+ char *link = folio_address(folio);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
@@ -779,9 +778,10 @@ repeat:
goto fail;
brelse(bh);
*rpnt = '\0';
- SetPageUptodate(page);
- unlock_page(page);
- return 0;
+ ret = 0;
+end:
+ folio_end_read(folio, ret == 0);
+ return ret;
/* error exit from macro */
out:
@@ -795,9 +795,8 @@ out_bad_span:
fail:
brelse(bh);
error:
- SetPageError(page);
- unlock_page(page);
- return -EIO;
+ ret = -EIO;
+ goto end;
}
const struct address_space_operations isofs_symlink_aops = {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 75ea4e9a5cab..4305a1ac808a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -353,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
- int flags;
+ int escape;
int err;
unsigned long long blocknr;
ktime_t start_time;
@@ -660,10 +660,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
- flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+ escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
- if (flags < 0) {
- jbd2_journal_abort(journal, flags);
+ if (escape < 0) {
+ jbd2_journal_abort(journal, escape);
continue;
}
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
@@ -672,7 +672,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
buffer */
tag_flag = 0;
- if (flags & 1)
+ if (escape)
tag_flag |= JBD2_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JBD2_FLAG_SAME_UUID;
@@ -766,7 +766,7 @@ start_journal_io:
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
- if (freed < jbd2_journal_get_max_txn_bufs(journal))
+ if (freed < journal->j_max_transaction_buffers)
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);
@@ -1107,7 +1107,7 @@ restart_loop:
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
- journal->j_commit_sequence = commit_transaction->t_tid;
+ WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
journal->j_committing_transaction = NULL;
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 03c4b9214f56..1ebf2393bfb7 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -220,19 +220,12 @@ loop:
* so we don't sleep
*/
DEFINE_WAIT(wait);
- int should_sleep = 1;
prepare_to_wait(&journal->j_wait_commit, &wait,
TASK_INTERRUPTIBLE);
- if (journal->j_commit_sequence != journal->j_commit_request)
- should_sleep = 0;
transaction = journal->j_running_transaction;
- if (transaction && time_after_eq(jiffies,
- transaction->t_expires))
- should_sleep = 0;
- if (journal->j_flags & JBD2_UNMOUNT)
- should_sleep = 0;
- if (should_sleep) {
+ if (transaction == NULL ||
+ time_before(jiffies, transaction->t_expires)) {
write_unlock(&journal->j_state_lock);
schedule();
write_lock(&journal->j_state_lock);
@@ -316,11 +309,8 @@ static void journal_kill_thread(journal_t *journal)
*
* Return value:
* <0: Error
- * >=0: Finished OK
- *
- * On success:
- * Bit 0 set == escape performed on the data
- * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ * =0: Finished OK without escape
+ * =1: Finished OK with escape
*/
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
@@ -328,7 +318,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct buffer_head **bh_out,
sector_t blocknr)
{
- int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
@@ -355,7 +344,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
atomic_set(&new_bh->b_count, 1);
spin_lock(&jh_in->b_state_lock);
-repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
@@ -365,8 +353,8 @@ repeat:
new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
- new_folio = jh2bh(jh_in)->b_folio;
- new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
+ new_folio = bh_in->b_folio;
+ new_offset = offset_in_folio(new_folio, bh_in->b_data);
}
mapped_data = kmap_local_folio(new_folio, new_offset);
@@ -383,54 +371,52 @@ repeat:
/*
* Check for escaping
*/
- if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- need_copy_out = 1;
+ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
do_escape = 1;
- }
kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
*/
- if (need_copy_out && !done_copy_out) {
+ if (do_escape && !done_copy_out) {
char *tmp;
spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
+ free_buffer_head(new_bh);
return -ENOMEM;
}
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
- goto repeat;
+ goto copy_done;
}
jh_in->b_frozen_data = tmp;
memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
-
- new_folio = virt_to_folio(tmp);
- new_offset = offset_in_folio(new_folio, tmp);
- done_copy_out = 1;
-
/*
* This isn't strictly necessary, as we're using frozen
* data for the escaping, but it keeps consistency with
* b_frozen_data usage.
*/
jh_in->b_frozen_triggers = jh_in->b_triggers;
+
+copy_done:
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
+ done_copy_out = 1;
}
/*
* Did we need to do an escaping? Now we've done all the
* copying, we can finally do so.
+ * b_frozen_data is from jbd2_alloc() which always provides an
+ * address from the direct kernels mapping.
*/
- if (do_escape) {
- mapped_data = kmap_local_folio(new_folio, new_offset);
- *((unsigned int *)mapped_data) = 0;
- kunmap_local(mapped_data);
- }
+ if (do_escape)
+ *((unsigned int *)jh_in->b_frozen_data) = 0;
folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
@@ -454,7 +440,7 @@ repeat:
set_buffer_shadow(bh_in);
spin_unlock(&jh_in->b_state_lock);
- return do_escape | (done_copy_out << 1);
+ return do_escape;
}
/*
@@ -789,17 +775,7 @@ EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
- int ret = 1;
-
- read_lock(&journal->j_state_lock);
- if (journal->j_running_transaction &&
- journal->j_running_transaction->t_tid == tid)
- ret = 0;
- if (journal->j_committing_transaction &&
- journal->j_committing_transaction->t_tid == tid)
- ret = 0;
- read_unlock(&journal->j_state_lock);
- return ret;
+ return tid_geq(READ_ONCE(journal->j_commit_sequence), tid);
}
EXPORT_SYMBOL(jbd2_transaction_committed);
@@ -1451,6 +1427,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
return space / record_size;
}
+static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+ return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
+}
+
+/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
+ tags_per_block);
+}
+
+/*
+ * Initialize number of blocks each transaction reserves for its bookkeeping
+ * and maximum number of blocks a transaction can use. This needs to be called
+ * after the journal size and the fastcommit area size are initialized.
+ */
+static void jbd2_journal_init_transaction_limits(journal_t *journal)
+{
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+ journal->j_transaction_overhead_buffers =
+ jbd2_descriptor_blocks_per_trans(journal);
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+}
+
/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
@@ -1492,8 +1510,8 @@ static int journal_load_superblock(journal_t *journal)
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ /* After journal features are set, we can compute transaction limits */
+ jbd2_journal_init_transaction_limits(journal);
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
@@ -1599,7 +1617,6 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
- journal->j_shrinker->batch = journal->j_max_transaction_buffers;
journal->j_shrinker->private_data = journal;
shrinker_register(journal->j_shrinker);
@@ -1743,8 +1760,6 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
-
/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
@@ -2285,8 +2300,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
journal->j_free = journal->j_last - journal->j_first;
- journal->j_max_transaction_buffers =
- jbd2_journal_get_max_txn_bufs(journal);
return 0;
}
@@ -2374,8 +2387,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2406,8 +2418,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
@@ -3181,6 +3192,7 @@ static void __exit journal_exit(void)
jbd2_journal_destroy_caches();
}
+MODULE_DESCRIPTION("Generic filesystem journal-writing module");
MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1f7664984d6e..667f67342c52 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -19,6 +19,7 @@
#include <linux/errno.h>
#include <linux/crc32.h>
#include <linux/blkdev.h>
+#include <linux/string_choices.h>
#endif
/*
@@ -374,7 +375,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
be32_to_cpu(journal->j_superblock->s_sequence);
jbd2_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
- dropped, (dropped == 1) ? "" : "s");
+ dropped, str_plural(dropped));
#endif
journal->j_transaction_sequence = ++info.end_transaction;
journal->j_head = info.head_block;
@@ -443,6 +444,27 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
return provided == cpu_to_be32(calculated);
}
+static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
+{
+ struct commit_header *h;
+ __be32 provided;
+ __u32 calculated;
+ void *tmpbuf;
+
+ tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL);
+ if (!tmpbuf)
+ return false;
+
+ memcpy(tmpbuf, buf, sizeof(struct commit_header));
+ h = tmpbuf;
+ provided = h->h_chksum[0];
+ h->h_chksum[0] = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
+ kfree(tmpbuf);
+
+ return provided == cpu_to_be32(calculated);
+}
+
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
journal_block_tag3_t *tag3,
void *buf, __u32 sequence)
@@ -810,6 +832,13 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN &&
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
+ if (jbd2_commit_block_csum_verify_partial(
+ journal,
+ bh->b_data)) {
+ pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+ next_commit_ID, next_log_block);
+ goto chksum_ok;
+ }
chksum_error:
if (commit_time < last_trans_commit_time)
goto ignore_crc_mismatch;
@@ -824,6 +853,7 @@ static int do_one_pass(journal_t *journal,
}
}
if (pass == PASS_SCAN) {
+ chksum_ok:
last_trans_commit_time = commit_time;
head_block = next_log_block;
}
@@ -843,6 +873,7 @@ static int do_one_pass(journal_t *journal,
next_log_block);
need_check_commit_time = true;
}
+
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cb0b8d6fc0c6..66513c18ca29 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,28 +63,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
}
/*
- * Base amount of descriptor blocks we reserve for each transaction.
- */
-static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
-{
- int tag_space = journal->j_blocksize - sizeof(journal_header_t);
- int tags_per_block;
-
- /* Subtract UUID */
- tag_space -= 16;
- if (jbd2_journal_has_csum_v2or3(journal))
- tag_space -= sizeof(struct jbd2_journal_block_tail);
- /* Commit code leaves a slack space of 16 bytes at the end of block */
- tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
- /*
- * Revoke descriptors are accounted separately so we need to reserve
- * space for commit block and normal transaction descriptor blocks.
- */
- return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
- tags_per_block);
-}
-
-/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
* Simply initialise a new transaction. Initialize it in
@@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_expires = jiffies + journal->j_commit_interval;
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
- jbd2_descriptor_blocks_per_trans(journal) +
+ journal->j_transaction_overhead_buffers +
atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
@@ -213,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
wake_up(&journal->j_wait_reserved);
}
+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+ return journal->j_max_transaction_buffers -
+ journal->j_transaction_overhead_buffers;
+}
+
/*
* Wait until we can add credits for handle to the running transaction. Called
* with j_state_lock held for reading. Returns 0 if handle joined the running
@@ -262,12 +247,12 @@ __must_hold(&journal->j_state_lock)
* big to fit this handle? Wait until reserved credits are freed.
*/
if (atomic_read(&journal->j_reserved_credits) + total >
- journal->j_max_transaction_buffers) {
+ jbd2_max_user_trans_buffers(journal)) {
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -307,14 +292,14 @@ __must_hold(&journal->j_state_lock)
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
/* We allow at most half of a transaction to be reserved */
- if (needed > journal->j_max_transaction_buffers / 2) {
+ if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
- <= journal->j_max_transaction_buffers / 2);
+ <= jbd2_max_user_trans_buffers(journal) / 2);
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -344,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
* size and limit the number of total credits to not exceed maximum
* transaction size per operation.
*/
- if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
- (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
+ rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
printk(KERN_ERR "JBD2: %s wants too many credits "
"credits:%d rsv_credits:%d max:%d\n",
current->comm, blocks, rsv_blocks,
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
WARN_ON(1);
return -ENOSPC;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 62ea76da7fdf..e12cb145147e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -95,13 +95,8 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
PAGE_SIZE);
- if (ret) {
- ClearPageUptodate(pg);
- SetPageError(pg);
- } else {
+ if (!ret)
SetPageUptodate(pg);
- ClearPageError(pg);
- }
flush_dcache_page(pg);
kunmap(pg);
@@ -304,10 +299,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
kunmap(pg);
- if (ret) {
- /* There was an error writing. */
- SetPageError(pg);
- }
+ if (ret)
+ mapping_set_error(mapping, ret);
/* Adjust writtenlen for the padding we did, so we don't confuse our caller */
writtenlen -= min(writtenlen, (start - aligned_start));
@@ -330,7 +323,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
it gets reread */
jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n",
__func__);
- SetPageError(pg);
ClearPageUptodate(pg);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index b635ee5adbcc..8aa34870449f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1854,6 +1854,80 @@ static const struct dentry_operations generic_ci_dentry_ops = {
.d_revalidate = fscrypt_d_revalidate,
#endif
};
+
+/**
+ * generic_ci_match() - Match a name (case-insensitively) with a dirent.
+ * This is a filesystem helper for comparison with directory entries.
+ * generic_ci_d_compare should be used in VFS' ->d_compare instead.
+ *
+ * @parent: Inode of the parent of the dirent under comparison
+ * @name: name under lookup.
+ * @folded_name: Optional pre-folded name under lookup
+ * @de_name: Dirent name.
+ * @de_name_len: dirent name length.
+ *
+ * Test whether a case-insensitive directory entry matches the filename
+ * being searched. If @folded_name is provided, it is used instead of
+ * recalculating the casefold of @name.
+ *
+ * Return: > 0 if the directory entry matches, 0 if it doesn't match, or
+ * < 0 on error.
+ */
+int generic_ci_match(const struct inode *parent,
+ const struct qstr *name,
+ const struct qstr *folded_name,
+ const u8 *de_name, u32 de_name_len)
+{
+ const struct super_block *sb = parent->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
+ struct qstr dirent = QSTR_INIT(de_name, de_name_len);
+ int res = 0;
+
+ if (IS_ENCRYPTED(parent)) {
+ const struct fscrypt_str encrypted_name =
+ FSTR_INIT((u8 *) de_name, de_name_len);
+
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)))
+ return -EINVAL;
+
+ decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
+ if (!decrypted_name.name)
+ return -ENOMEM;
+ res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
+ &decrypted_name);
+ if (res < 0) {
+ kfree(decrypted_name.name);
+ return res;
+ }
+ dirent.name = decrypted_name.name;
+ dirent.len = decrypted_name.len;
+ }
+
+ /*
+ * Attempt a case-sensitive match first. It is cheaper and
+ * should cover most lookups, including all the sane
+ * applications that expect a case-sensitive filesystem.
+ */
+
+ if (dirent.len == name->len &&
+ !memcmp(name->name, dirent.name, dirent.len))
+ goto out;
+
+ if (folded_name->name)
+ res = utf8_strncasecmp_folded(um, folded_name, &dirent);
+ else
+ res = utf8_strncasecmp(um, name, &dirent);
+
+out:
+ kfree(decrypted_name.name);
+ if (res < 0 && sb_has_strict_encoding(sb)) {
+ pr_err_ratelimited("Directory contains filename that is invalid UTF-8");
+ return 0;
+ }
+ return !res;
+}
+EXPORT_SYMBOL(generic_ci_match);
#endif
#ifdef CONFIG_FS_ENCRYPTION
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ac9f9d84510e..fe3e23dd29c3 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -7,8 +7,7 @@ ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_LOCKD) += lockd.o
-lockd-objs-y += clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o
-lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
-lockd-objs-$(CONFIG_PROC_FS) += procfs.o
-lockd-objs := $(lockd-objs-y)
+lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
+ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o
+lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
+lockd-$(CONFIG_PROC_FS) += procfs.o
diff --git a/fs/locks.c b/fs/locks.c
index 90c8746874de..bdd94c32256f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1367,9 +1367,9 @@ retry:
locks_wake_up_blocks(&left->c);
}
out:
+ trace_posix_lock_inode(inode, request, error);
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
- trace_posix_lock_inode(inode, request, error);
/*
* Free any unused locks.
*/
@@ -2448,8 +2448,9 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by releasing the
- * lock that was just acquired. There is no need to do that when we're
+ * Detect close/fcntl races and recover by zapping all POSIX locks
+ * associated with this file and our files_struct, just like on
+ * filp_flush(). There is no need to do that when we're
* unlocking though, or for OFD locks.
*/
if (!error && file_lock->c.flc_type != F_UNLCK &&
@@ -2464,9 +2465,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
f = files_lookup_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (f != filp) {
- file_lock->c.flc_type = F_UNLCK;
- error = do_lock_file_wait(filp, cmd, file_lock);
- WARN_ON_ONCE(error);
+ locks_remove_posix(filp, files);
error = -EBADF;
}
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7f9a2d8aa420..1c3df63162ef 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -730,5 +730,6 @@ static void __exit exit_minix_fs(void)
module_init(init_minix_fs)
module_exit(exit_minix_fs)
+MODULE_DESCRIPTION("Minix file system");
MODULE_LICENSE("GPL");
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index d6031acc34f0..a944a0f17b53 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,8 +213,7 @@ static int minix_rename(struct mnt_idmap *idmap,
if (!new_de)
goto out_dir;
err = minix_set_link(new_de, new_page, old_inode);
- kunmap(new_page);
- put_page(new_page);
+ unmap_and_put_page(new_page, new_de);
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
diff --git a/fs/mount.h b/fs/mount.h
index 4a42fc68f4cc..ad4b1ddebb54 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,6 +16,8 @@ struct mnt_namespace {
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
+ struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
+ refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
struct mnt_pcp {
@@ -152,3 +154,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
}
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+bool has_locked_children(struct mount *mnt, struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index fa8b99a199fa..b5b5ddf9d513 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -48,13 +48,8 @@ static void mpage_read_end_io(struct bio *bio)
struct folio_iter fi;
int err = blk_status_to_errno(bio->bi_status);
- bio_for_each_folio_all(fi, bio) {
- if (err)
- folio_set_error(fi.folio);
- else
- folio_mark_uptodate(fi.folio);
- folio_unlock(fi.folio);
- }
+ bio_for_each_folio_all(fi, bio)
+ folio_end_read(fi.folio, err == 0);
bio_put(bio);
}
@@ -65,10 +60,8 @@ static void mpage_write_end_io(struct bio *bio)
int err = blk_status_to_errno(bio->bi_status);
bio_for_each_folio_all(fi, bio) {
- if (err) {
- folio_set_error(fi.folio);
+ if (err)
mapping_set_error(fi.folio->mapping, err);
- }
folio_end_writeback(fi.folio);
}
diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..3a4c40e12f78 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -126,7 +126,7 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
-getname_flags(const char __user *filename, int flags, int *empty)
+getname_flags(const char __user *filename, int flags)
{
struct filename *result;
char *kname;
@@ -148,9 +148,20 @@ getname_flags(const char __user *filename, int flags, int *empty)
result->name = kname;
len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
- if (unlikely(len < 0)) {
- __putname(result);
- return ERR_PTR(len);
+ /*
+ * Handle both empty path and copy failure in one go.
+ */
+ if (unlikely(len <= 0)) {
+ if (unlikely(len < 0)) {
+ __putname(result);
+ return ERR_PTR(len);
+ }
+
+ /* The empty path is special. */
+ if (!(flags & LOOKUP_EMPTY)) {
+ __putname(result);
+ return ERR_PTR(-ENOENT);
+ }
}
/*
@@ -180,6 +191,12 @@ getname_flags(const char __user *filename, int flags, int *empty)
kfree(result);
return ERR_PTR(len);
}
+ /* The empty path is special. */
+ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(-ENOENT);
+ }
if (unlikely(len == PATH_MAX)) {
__putname(kname);
kfree(result);
@@ -188,16 +205,6 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
atomic_set(&result->refcnt, 1);
- /* The empty path is special. */
- if (unlikely(!len)) {
- if (empty)
- *empty = 1;
- if (!(flags & LOOKUP_EMPTY)) {
- putname(result);
- return ERR_PTR(-ENOENT);
- }
- }
-
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
@@ -209,13 +216,13 @@ getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
- return getname_flags(filename, flags, NULL);
+ return getname_flags(filename, flags);
}
struct filename *
getname(const char __user * filename)
{
- return getname_flags(filename, 0, NULL);
+ return getname_flags(filename, 0);
}
struct filename *
@@ -1233,29 +1240,48 @@ int may_linkat(struct mnt_idmap *idmap, const struct path *link)
*
* Returns 0 if the open is allowed, -ve on error.
*/
-static int may_create_in_sticky(struct mnt_idmap *idmap,
- struct nameidata *nd, struct inode *const inode)
+static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
+ struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode;
- vfsuid_t dir_vfsuid = nd->dir_vfsuid;
+ vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;
- if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
- (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
- likely(!(dir_mode & S_ISVTX)) ||
- vfsuid_eq(i_uid_into_vfsuid(idmap, inode), dir_vfsuid) ||
- vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
+ if (likely(!(dir_mode & S_ISVTX)))
return 0;
- if (likely(dir_mode & 0002) ||
- (dir_mode & 0020 &&
- ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
- (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
- const char *operation = S_ISFIFO(inode->i_mode) ?
- "sticky_create_fifo" :
- "sticky_create_regular";
- audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
+ if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
+ return 0;
+
+ if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
+ return 0;
+
+ i_vfsuid = i_uid_into_vfsuid(idmap, inode);
+
+ if (vfsuid_eq(i_vfsuid, dir_vfsuid))
+ return 0;
+
+ if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
+ return 0;
+
+ if (likely(dir_mode & 0002)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
return -EACCES;
}
+
+ if (dir_mode & 0020) {
+ if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT,
+ "sticky_create_fifo");
+ return -EACCES;
+ }
+
+ if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
+ audit_log_path_denied(AUDIT_ANOM_CREAT,
+ "sticky_create_regular");
+ return -EACCES;
+ }
+ }
+
return 0;
}
@@ -1712,17 +1738,26 @@ static struct dentry *lookup_slow(const struct qstr *name,
}
static inline int may_lookup(struct mnt_idmap *idmap,
- struct nameidata *nd)
+ struct nameidata *restrict nd)
{
- if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- if (!err) // success, keep going
- return 0;
- if (!try_to_unlazy(nd))
- return -ECHILD; // redo it all non-lazy
- if (err != -ECHILD) // hard error
- return err;
- }
+ int err, mask;
+
+ mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
+ err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+ if (likely(!err))
+ return 0;
+
+ // If we failed, and we weren't in LOOKUP_RCU, it's final
+ if (!(nd->flags & LOOKUP_RCU))
+ return err;
+
+ // Drop out of RCU mode to make sure it wasn't transient
+ if (!try_to_unlazy(nd))
+ return -ECHILD; // redo it all non-lazy
+
+ if (err != -ECHILD) // hard error
+ return err;
+
return inode_permission(idmap, nd->inode, MAY_EXEC);
}
@@ -2163,21 +2198,39 @@ EXPORT_SYMBOL(hashlen_string);
/*
* Calculate the length and hash of the path component, and
- * return the "hash_len" as the result.
+ * return the length as the result.
*/
-static inline u64 hash_name(const void *salt, const char *name)
+static inline const char *hash_name(struct nameidata *nd,
+ const char *name,
+ unsigned long *lastword)
{
- unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = 0;
- goto inside;
+ /*
+ * The first iteration is special, because it can result in
+ * '.' and '..' and has no mixing other than the final fold.
+ */
+ a = load_unaligned_zeropad(name);
+ b = a ^ REPEAT_BYTE('/');
+ if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
+ adata = prep_zero_mask(a, adata, &constants);
+ bdata = prep_zero_mask(b, bdata, &constants);
+ mask = create_zero_mask(adata | bdata);
+ a &= zero_bytemask(mask);
+ *lastword = a;
+ len = find_zero(mask);
+ nd->last.hash = fold_hash(a, y);
+ nd->last.len = len;
+ return name + len;
+ }
+ len = 0;
+ x = 0;
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
-inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
@@ -2185,11 +2238,25 @@ inside:
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
mask = create_zero_mask(adata | bdata);
- x ^= a & zero_bytemask(mask);
+ a &= zero_bytemask(mask);
+ x ^= a;
+ len += find_zero(mask);
+ *lastword = 0; // Multi-word components cannot be DOT or DOTDOT
- return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+ nd->last.hash = fold_hash(x, y);
+ nd->last.len = len;
+ return name + len;
}
+/*
+ * Note that the 'last' word is always zero-masked, but
+ * was loaded as a possibly big-endian word.
+ */
+#ifdef __BIG_ENDIAN
+ #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8))
+ #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16))
+#endif
+
#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
@@ -2222,22 +2289,35 @@ EXPORT_SYMBOL(hashlen_string);
* We know there's a real path component here of at least
* one character.
*/
-static inline u64 hash_name(const void *salt, const char *name)
+static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
- unsigned long hash = init_name_hash(salt);
- unsigned long len = 0, c;
+ unsigned long hash = init_name_hash(nd->path.dentry);
+ unsigned long len = 0, c, last = 0;
c = (unsigned char)*name;
do {
+ last = (last << 8) + c;
len++;
hash = partial_name_hash(c, hash);
c = (unsigned char)name[len];
} while (c && c != '/');
- return hashlen_create(end_name_hash(hash), len);
+
+ // This is reliable for DOT or DOTDOT, since the component
+ // cannot contain NUL characters - top bits being zero means
+ // we cannot have had any other pathnames.
+ *lastword = last;
+ nd->last.hash = end_name_hash(hash);
+ nd->last.len = len;
+ return name + len;
}
#endif
+#ifndef LAST_WORD_IS_DOT
+ #define LAST_WORD_IS_DOT 0x2e
+ #define LAST_WORD_IS_DOTDOT 0x2e2e
+#endif
+
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
@@ -2266,45 +2346,38 @@ static int link_path_walk(const char *name, struct nameidata *nd)
for(;;) {
struct mnt_idmap *idmap;
const char *link;
- u64 hash_len;
- int type;
+ unsigned long lastword;
idmap = mnt_idmap(nd->path.mnt);
err = may_lookup(idmap, nd);
if (err)
return err;
- hash_len = hash_name(nd->path.dentry, name);
+ nd->last.name = name;
+ name = hash_name(nd, name, &lastword);
- type = LAST_NORM;
- if (name[0] == '.') switch (hashlen_len(hash_len)) {
- case 2:
- if (name[1] == '.') {
- type = LAST_DOTDOT;
- nd->state |= ND_JUMPED;
- }
- break;
- case 1:
- type = LAST_DOT;
- }
- if (likely(type == LAST_NORM)) {
- struct dentry *parent = nd->path.dentry;
+ switch(lastword) {
+ case LAST_WORD_IS_DOTDOT:
+ nd->last_type = LAST_DOTDOT;
+ nd->state |= ND_JUMPED;
+ break;
+
+ case LAST_WORD_IS_DOT:
+ nd->last_type = LAST_DOT;
+ break;
+
+ default:
+ nd->last_type = LAST_NORM;
nd->state &= ~ND_JUMPED;
+
+ struct dentry *parent = nd->path.dentry;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
- struct qstr this = { { .hash_len = hash_len }, .name = name };
- err = parent->d_op->d_hash(parent, &this);
+ err = parent->d_op->d_hash(parent, &nd->last);
if (err < 0)
return err;
- hash_len = this.hash_len;
- name = this.name;
}
}
- nd->last.hash_len = hash_len;
- nd->last.name = name;
- nd->last_type = type;
-
- name += hashlen_len(hash_len);
if (!*name)
goto OK;
/*
@@ -2922,16 +2995,16 @@ int path_pts(struct path *path)
}
#endif
-int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
- struct path *path, int *empty)
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+ struct path *path)
{
- struct filename *filename = getname_flags(name, flags, empty);
+ struct filename *filename = getname_flags(name, flags);
int ret = filename_lookup(dfd, filename, flags, path, NULL);
putname(filename);
return ret;
}
-EXPORT_SYMBOL(user_path_at_empty);
+EXPORT_SYMBOL(user_path_at);
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode)
@@ -3572,8 +3645,12 @@ static const char *open_last_lookups(struct nameidata *nd,
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
- if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
- fsnotify_create(dir->d_inode, dentry);
+ if (!IS_ERR(dentry)) {
+ if (file->f_mode & FMODE_CREATED)
+ fsnotify_create(dir->d_inode, dentry);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
+ }
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
@@ -3700,6 +3777,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
error = dir->i_op->tmpfile(idmap, dir, file, mode);
dput(child);
+ if (file->f_mode & FMODE_OPENED)
+ fsnotify_open(file);
if (error)
return error;
/* Don't check for other permissions, the inode was just created */
diff --git a/fs/namespace.c b/fs/namespace.c
index 5a51315c6678..221db9de4729 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -70,7 +70,8 @@ static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+#define MNT_UNIQUE_ID_OFFSET (1ULL << 32)
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -78,6 +79,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
struct mount_kattr {
unsigned int attr_set;
@@ -103,6 +106,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
+static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
+{
+ u64 seq_b = ns->seq;
+
+ if (seq < seq_b)
+ return -1;
+ if (seq > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
+}
+
+static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct mnt_namespace *ns_a = node_to_mnt_ns(a);
+ struct mnt_namespace *ns_b = node_to_mnt_ns(b);
+ u64 seq_a = ns_a->seq;
+
+ return mnt_ns_cmp(seq_a, ns_b) < 0;
+}
+
+static void mnt_ns_tree_add(struct mnt_namespace *ns)
+{
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+}
+
+static void mnt_ns_release(struct mnt_namespace *ns)
+{
+ lockdep_assert_not_held(&mnt_ns_tree_lock);
+
+ /* keep alive for {list,stat}mount() */
+ if (refcount_dec_and_test(&ns->passive)) {
+ put_user_ns(ns->user_ns);
+ kfree(ns);
+ }
+}
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+
+static void mnt_ns_tree_remove(struct mnt_namespace *ns)
+{
+ /* remove from global mount namespace list */
+ if (!is_anon_ns(ns)) {
+ guard(write_lock)(&mnt_ns_tree_lock);
+ rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ }
+
+ mnt_ns_release(ns);
+}
+
+/*
+ * Returns the mount namespace which either has the specified id, or has the
+ * next smallest id afer the specified one.
+ */
+static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+{
+ struct rb_node *node = mnt_ns_tree.rb_node;
+ struct mnt_namespace *ret = NULL;
+
+ lockdep_assert_held(&mnt_ns_tree_lock);
+
+ while (node) {
+ struct mnt_namespace *n = node_to_mnt_ns(node);
+
+ if (mnt_ns_id <= n->seq) {
+ ret = node_to_mnt_ns(node);
+ if (mnt_ns_id == n->seq)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Lookup a mount namespace by id and take a passive reference count. Taking a
+ * passive reference means the mount namespace can be emptied if e.g., the last
+ * task holding an active reference exits. To access the mounts of the
+ * namespace the @namespace_sem must first be acquired. If the namespace has
+ * already shut down before acquiring @namespace_sem, {list,stat}mount() will
+ * see that the mount rbtree of the namespace is empty.
+ */
+static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
+{
+ struct mnt_namespace *ns;
+
+ guard(read_lock)(&mnt_ns_tree_lock);
+ ns = mnt_ns_find_id_at(mnt_ns_id);
+ if (!ns || ns->seq != mnt_ns_id)
+ return NULL;
+
+ refcount_inc(&ns->passive);
+ return ns;
+}
+
static inline void lock_mount_hash(void)
{
write_seqlock(&mount_lock);
@@ -1448,6 +1554,30 @@ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
return ret;
}
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * greater id before the specified one.
+ */
+static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
+{
+ struct rb_node *node = ns->mounts.rb_node;
+ struct mount *ret = NULL;
+
+ while (node) {
+ struct mount *m = node_to_mount(node);
+
+ if (mnt_id >= m->mnt_id_unique) {
+ ret = node_to_mount(node);
+ if (mnt_id == m->mnt_id_unique)
+ break;
+ node = node->rb_right;
+ } else {
+ node = node->rb_left;
+ }
+ }
+ return ret;
+}
+
#ifdef CONFIG_PROC_FS
/* iterator; we want it to have access to namespace_sem, thus here... */
@@ -1846,19 +1976,6 @@ bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
-/**
- * path_mounted - check whether path is mounted
- * @path: path to check
- *
- * Determine whether @path refers to the root of a mount.
- *
- * Return: true if @path is the root of a mount, false if not.
- */
-static inline bool path_mounted(const struct path *path)
-{
- return path->mnt->mnt_root == path->dentry;
-}
-
static void warn_mandlock(void)
{
pr_warn_once("=======================================================\n"
@@ -1966,69 +2083,72 @@ static bool mnt_ns_loop(struct dentry *dentry)
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
-struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
+struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
int flag)
{
- struct mount *res, *p, *q, *r, *parent;
+ struct mount *res, *src_parent, *src_root_child, *src_mnt,
+ *dst_parent, *dst_mnt;
- if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
+ if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
return ERR_PTR(-EINVAL);
if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
return ERR_PTR(-EINVAL);
- res = q = clone_mnt(mnt, dentry, flag);
- if (IS_ERR(q))
- return q;
+ res = dst_mnt = clone_mnt(src_root, dentry, flag);
+ if (IS_ERR(dst_mnt))
+ return dst_mnt;
- q->mnt_mountpoint = mnt->mnt_mountpoint;
+ src_parent = src_root;
+ dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
- p = mnt;
- list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
- struct mount *s;
- if (!is_subdir(r->mnt_mountpoint, dentry))
+ list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
+ if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
continue;
- for (s = r; s; s = next_mnt(s, r)) {
+ for (src_mnt = src_root_child; src_mnt;
+ src_mnt = next_mnt(src_mnt, src_root_child)) {
if (!(flag & CL_COPY_UNBINDABLE) &&
- IS_MNT_UNBINDABLE(s)) {
- if (s->mnt.mnt_flags & MNT_LOCKED) {
+ IS_MNT_UNBINDABLE(src_mnt)) {
+ if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
/* Both unbindable and locked. */
- q = ERR_PTR(-EPERM);
+ dst_mnt = ERR_PTR(-EPERM);
goto out;
} else {
- s = skip_mnt_tree(s);
+ src_mnt = skip_mnt_tree(src_mnt);
continue;
}
}
if (!(flag & CL_COPY_MNT_NS_FILE) &&
- is_mnt_ns_file(s->mnt.mnt_root)) {
- s = skip_mnt_tree(s);
+ is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
+ src_mnt = skip_mnt_tree(src_mnt);
continue;
}
- while (p != s->mnt_parent) {
- p = p->mnt_parent;
- q = q->mnt_parent;
+ while (src_parent != src_mnt->mnt_parent) {
+ src_parent = src_parent->mnt_parent;
+ dst_mnt = dst_mnt->mnt_parent;
}
- p = s;
- parent = q;
- q = clone_mnt(p, p->mnt.mnt_root, flag);
- if (IS_ERR(q))
+
+ src_parent = src_mnt;
+ dst_parent = dst_mnt;
+ dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
+ if (IS_ERR(dst_mnt))
goto out;
lock_mount_hash();
- list_add_tail(&q->mnt_list, &res->mnt_list);
- attach_mnt(q, parent, p->mnt_mp, false);
+ list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
+ attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
unlock_mount_hash();
}
}
return res;
+
out:
if (res) {
lock_mount_hash();
umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
- return q;
+ return dst_mnt;
}
/* Caller should check returned pointer for errors */
@@ -2078,7 +2198,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -3709,8 +3829,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
if (!is_anon_ns(ns))
ns_free_inum(&ns->ns);
dec_mnt_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- kfree(ns);
+ mnt_ns_tree_remove(ns);
}
/*
@@ -3749,7 +3868,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
+ refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
+ RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
@@ -3826,6 +3947,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
+ mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -4843,6 +4965,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
+{
+ s->sm.mask |= STATMOUNT_MNT_NS_ID;
+ s->sm.mnt_ns_id = ns->seq;
+}
+
+static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ int err;
+
+ if (sb->s_op->show_options) {
+ size_t start = seq->count;
+
+ err = sb->s_op->show_options(seq, mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ /* skip leading comma */
+ memmove(seq->buf + start, seq->buf + start + 1,
+ seq->count - start - 1);
+ seq->count--;
+ }
+
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret;
@@ -4863,6 +5019,10 @@ static int statmount_string(struct kstatmount *s, u64 flag)
sm->mnt_point = seq->count;
ret = statmount_mnt_point(s, seq);
break;
+ case STATMOUNT_MNT_OPTS:
+ sm->mnt_opts = seq->count;
+ ret = statmount_mnt_opts(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -4903,23 +5063,84 @@ static int copy_statmount_to_user(struct kstatmount *s)
return 0;
}
-static int do_statmount(struct kstatmount *s)
+static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
- struct mount *m = real_mount(s->mnt);
+ struct rb_node *node;
+
+ if (reverse)
+ node = rb_prev(&curr->mnt_node);
+ else
+ node = rb_next(&curr->mnt_node);
+
+ return node_to_mount(node);
+}
+
+static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
+{
+ struct mount *first, *child;
+
+ rwsem_assert_held(&namespace_sem);
+
+ /* We're looking at our own ns, just use get_fs_root. */
+ if (ns == current->nsproxy->mnt_ns) {
+ get_fs_root(current->fs, root);
+ return 0;
+ }
+
+ /*
+ * We have to find the first mount in our ns and use that, however it
+ * may not exist, so handle that properly.
+ */
+ if (RB_EMPTY_ROOT(&ns->mounts))
+ return -ENOENT;
+
+ first = child = ns->root;
+ for (;;) {
+ child = listmnt_next(child, false);
+ if (!child)
+ return -ENOENT;
+ if (child->mnt_parent == first)
+ break;
+ }
+
+ root->mnt = mntget(&child->mnt);
+ root->dentry = dget(root->mnt->mnt_root);
+ return 0;
+}
+
+static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
+ struct mnt_namespace *ns)
+{
+ struct path root __free(path_put) = {};
+ struct mount *m;
int err;
+ /* Has the namespace already been emptied? */
+ if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ return -ENOENT;
+
+ s->mnt = lookup_mnt_in_ns(mnt_id, ns);
+ if (!s->mnt)
+ return -ENOENT;
+
+ err = grab_requested_root(ns, &root);
+ if (err)
+ return err;
+
/*
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ m = real_mount(s->mnt);
+ if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
err = security_sb_statfs(s->mnt->mnt_root);
if (err)
return err;
+ s->root = root;
if (s->mask & STATMOUNT_SB_BASIC)
statmount_sb_basic(s);
@@ -4938,6 +5159,12 @@ static int do_statmount(struct kstatmount *s)
if (!err && s->mask & STATMOUNT_MNT_POINT)
err = statmount_string(s, STATMOUNT_MNT_POINT);
+ if (!err && s->mask & STATMOUNT_MNT_OPTS)
+ err = statmount_string(s, STATMOUNT_MNT_OPTS);
+
+ if (!err && s->mask & STATMOUNT_MNT_NS_ID)
+ statmount_mnt_ns_id(s, ns);
+
if (err)
return err;
@@ -4955,6 +5182,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
return true;
}
+#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
size_t seq_size)
@@ -4966,10 +5196,18 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
ks->mask = kreq->param;
ks->buf = buf;
ks->bufsize = bufsize;
- ks->seq.size = seq_size;
- ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
- if (!ks->seq.buf)
- return -ENOMEM;
+
+ if (ks->mask & STATMOUNT_STRING_REQ) {
+ if (bufsize == sizeof(ks->sm))
+ return -EOVERFLOW;
+
+ ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
+ if (!ks->seq.buf)
+ return -ENOMEM;
+
+ ks->seq.size = seq_size;
+ }
+
return 0;
}
@@ -4979,7 +5217,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
int ret;
size_t usize;
- BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
ret = get_user(usize, &req->size);
if (ret)
@@ -4994,16 +5232,32 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
return ret;
if (kreq->spare != 0)
return -EINVAL;
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
return 0;
}
+/*
+ * If the user requested a specific mount namespace id, look that up and return
+ * that, or if not simply grab a passive reference on our mount namespace and
+ * return that.
+ */
+static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+{
+ if (mnt_ns_id)
+ return lookup_mnt_ns(mnt_ns_id);
+ refcount_inc(&current->nsproxy->mnt_ns->passive);
+ return current->nsproxy->mnt_ns;
+}
+
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
struct statmount __user *, buf, size_t, bufsize,
unsigned int, flags)
{
- struct vfsmount *mnt;
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
+ struct kstatmount *ks __free(kfree) = NULL;
struct mnt_id_req kreq;
- struct kstatmount ks;
/* We currently support retrieval of 3 strings. */
size_t seq_size = 3 * PATH_MAX;
int ret;
@@ -5015,64 +5269,88 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
+ ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
+ if (!ks)
+ return -ENOMEM;
+
retry:
- ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
+ ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
if (ret)
return ret;
- down_read(&namespace_sem);
- mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
- if (!mnt) {
- up_read(&namespace_sem);
- kvfree(ks.seq.buf);
- return -ENOENT;
- }
-
- ks.mnt = mnt;
- get_fs_root(current->fs, &ks.root);
- ret = do_statmount(&ks);
- path_put(&ks.root);
- up_read(&namespace_sem);
+ scoped_guard(rwsem_read, &namespace_sem)
+ ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
if (!ret)
- ret = copy_statmount_to_user(&ks);
- kvfree(ks.seq.buf);
+ ret = copy_statmount_to_user(ks);
+ kvfree(ks->seq.buf);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
-static struct mount *listmnt_next(struct mount *curr)
+static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
+ u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
+ bool reverse)
{
- return node_to_mount(rb_next(&curr->mnt_node));
-}
-
-static ssize_t do_listmount(struct mount *first, struct path *orig,
- u64 mnt_parent_id, u64 __user *mnt_ids,
- size_t nr_mnt_ids, const struct path *root)
-{
- struct mount *r;
+ struct path root __free(path_put) = {};
+ struct path orig;
+ struct mount *r, *first;
ssize_t ret;
+ rwsem_assert_held(&namespace_sem);
+
+ ret = grab_requested_root(ns, &root);
+ if (ret)
+ return ret;
+
+ if (mnt_parent_id == LSMT_ROOT) {
+ orig = root;
+ } else {
+ orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
+ if (!orig.mnt)
+ return -ENOENT;
+ orig.dentry = orig.mnt->mnt_root;
+ }
+
/*
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
- !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
- ret = security_sb_statfs(orig->dentry);
+ ret = security_sb_statfs(orig.dentry);
if (ret)
return ret;
- for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+ if (!last_mnt_id) {
+ if (reverse)
+ first = node_to_mount(rb_last(&ns->mounts));
+ else
+ first = node_to_mount(rb_first(&ns->mounts));
+ } else {
+ if (reverse)
+ first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
+ else
+ first = mnt_find_id_at(ns, last_mnt_id + 1);
+ }
+
+ for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
if (r->mnt_id_unique == mnt_parent_id)
continue;
- if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+ if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
continue;
- if (put_user(r->mnt_id_unique, mnt_ids))
- return -EFAULT;
+ *mnt_ids = r->mnt_id_unique;
mnt_ids++;
nr_mnt_ids--;
ret++;
@@ -5080,22 +5358,26 @@ static ssize_t do_listmount(struct mount *first, struct path *orig,
return ret;
}
-SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
- mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
+ u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ u64 *kmnt_ids __free(kvfree) = NULL;
+ const size_t maxcount = 1000000;
+ struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- struct mount *first;
- struct path root, orig;
- u64 mnt_parent_id, last_mnt_id;
- const size_t maxcount = (size_t)-1 >> 3;
+ u64 last_mnt_id;
ssize_t ret;
- if (flags)
+ if (flags & ~LISTMOUNT_REVERSE)
return -EINVAL;
+ /*
+ * If the mount namespace really has more than 1 million mounts the
+ * caller must iterate over the mount namespace (and reconsider their
+ * system design...).
+ */
if (unlikely(nr_mnt_ids > maxcount))
- return -EFAULT;
+ return -EOVERFLOW;
if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
return -EFAULT;
@@ -5103,33 +5385,37 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
ret = copy_mnt_id_req(req, &kreq);
if (ret)
return ret;
- mnt_parent_id = kreq.mnt_id;
+
last_mnt_id = kreq.param;
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
- down_read(&namespace_sem);
- get_fs_root(current->fs, &root);
- if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
- } else {
- ret = -ENOENT;
- orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
- if (!orig.mnt)
- goto err;
- orig.dentry = orig.mnt->mnt_root;
- }
- if (!last_mnt_id)
- first = node_to_mount(rb_first(&ns->mounts));
- else
- first = mnt_find_id_at(ns, last_mnt_id + 1);
+ kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kmnt_ids)
+ return -ENOMEM;
+
+ ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ if (!ns)
+ return -ENOENT;
+
+ if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ return -ENOENT;
+
+ scoped_guard(rwsem_read, &namespace_sem)
+ ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
+ nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ if (ret <= 0)
+ return ret;
+
+ if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ return -EFAULT;
- ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
-err:
- path_put(&root);
- up_read(&namespace_sem);
return ret;
}
-
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
@@ -5157,6 +5443,8 @@ static void __init init_mount_tree(void)
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
+
+ mnt_ns_tree_add(ns);
}
void __init mnt_init(void)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index a6bb03bea920..a6d5d07cd436 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
- _debug("no unlock");
+ kdebug("no unlock");
else
folio_unlock(folio);
}
@@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl)
struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
- _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+ kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
if (readahead_count(ractl) == 0)
return;
@@ -268,10 +268,10 @@ int netfs_read_folio(struct file *file, struct folio *folio)
struct folio *sink = NULL;
int ret;
- _enter("%lx", folio->index);
+ kenter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
- folio_file_pos(folio), folio_size(folio),
+ folio_pos(folio), folio_size(folio),
NETFS_READPAGE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
@@ -470,7 +470,7 @@ retry:
}
rreq = netfs_alloc_request(mapping, file,
- folio_file_pos(folio), folio_size(folio),
+ folio_pos(folio), folio_size(folio),
NETFS_READ_FOR_WRITE);
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
@@ -508,7 +508,7 @@ retry:
have_folio:
*_folio = folio;
- _leave(" = 0");
+ kleave(" = 0");
return 0;
error_put:
@@ -518,7 +518,7 @@ error:
folio_unlock(folio);
folio_put(folio);
}
- _leave(" = %d", ret);
+ kleave(" = %d", ret);
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
@@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t flen = folio_size(folio);
int ret;
- _enter("%zx @%llx", flen, start);
+ kenter("%zx @%llx", flen, start);
ret = -ENOMEM;
@@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
error:
- _leave(" = %d", ret);
+ kleave(" = %d", ret);
return ret;
}
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 07bc1fd43530..68a3f1383cee 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -54,9 +54,9 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
{
struct netfs_folio *finfo = netfs_folio_info(folio);
struct netfs_group *group = netfs_folio_group(folio);
- loff_t pos = folio_file_pos(folio);
+ loff_t pos = folio_pos(folio);
- _enter("");
+ kenter("");
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
return NETFS_FLUSH_CONTENT;
@@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
*/
howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
flen, offset, part, maybe_trouble);
- _debug("howto %u", howto);
+ kdebug("howto %u", howto);
switch (howto) {
case NETFS_JUST_PREFETCH:
ret = netfs_prefetch_for_write(file, folio, offset, part);
if (ret < 0) {
- _debug("prefetch = %zd", ret);
+ kdebug("prefetch = %zd", ret);
goto error_folio_unlock;
}
break;
@@ -418,7 +418,7 @@ out:
}
iocb->ki_pos += written;
- _leave(" = %zd [%zd]", written, ret);
+ kleave(" = %zd [%zd]", written, ret);
return written ? written : ret;
error_folio_unlock:
@@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct netfs_inode *ictx = netfs_inode(inode);
ssize_t ret;
- _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+ kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
@@ -523,17 +523,23 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct netfs_group *group;
struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
vm_fault_t ret = VM_FAULT_RETRY;
int err;
- _enter("%lx", folio->index);
+ kenter("%lx", folio->index);
sb_start_pagefault(inode->i_sb);
if (folio_lock_killable(folio) < 0)
goto out;
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
if (folio_wait_writeback_killable(folio)) {
ret = VM_FAULT_LOCKED;
@@ -549,9 +555,9 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
group = netfs_folio_group(folio);
if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
folio_unlock(folio);
- err = filemap_fdatawait_range(inode->i_mapping,
- folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ err = filemap_fdatawrite_range(mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 10a1e4da6bda..b6debac6205f 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
size_t orig_count = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
- _enter("");
+ kenter("");
if (!orig_count)
return 0; /* Don't update atime */
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index e14cd53ac9fd..792ef17bae21 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -37,7 +37,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
size_t len = iov_iter_count(iter);
bool async = !is_sync_kiocb(iocb);
- _enter("");
+ kenter("");
/* We're going to need a bounce buffer if what we transmit is going to
* be different in some way to the source buffer, e.g. because it gets
@@ -45,7 +45,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
*/
// TODO
- _debug("uw %llx-%llx", start, end);
+ kdebug("uw %llx-%llx", start, end);
wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
iocb->ki_flags & IOCB_DIRECT ?
@@ -92,10 +92,11 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async)
wreq->iocb = iocb;
+ wreq->len = iov_iter_count(&wreq->io_iter);
wreq->cleanup = netfs_cleanup_dio_write;
- ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
+ ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
- _debug("begin = %zd", ret);
+ kdebug("begin = %zd", ret);
goto out;
}
@@ -142,7 +143,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t pos = iocb->ki_pos;
unsigned long long end = pos + iov_iter_count(from) - 1;
- _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
+ kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
if (!iov_iter_count(from))
return 0;
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0b4..288a73c3072d 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache,
{
int n_accesses;
- _enter("{%s,%s}", ops->name, cache->name);
+ kenter("{%s,%s}", ops->name, cache->name);
BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING);
@@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache,
up_write(&fscache_addremove_sem);
pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name);
- _leave(" = 0 [%s]", cache->name);
+ kleave(" = 0 [%s]", cache->name);
return 0;
}
EXPORT_SYMBOL(fscache_add_cache);
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..4d1e8bf4c615 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
{
struct fscache_cookie *cookie;
- _enter("V=%x", volume->debug_id);
+ kenter("V=%x", volume->debug_id);
if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255)
return NULL;
@@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
trace_fscache_acquire(cookie);
fscache_stat(&fscache_n_acquires_ok);
- _leave(" = c=%08x", cookie->debug_id);
+ kleave(" = c=%08x", cookie->debug_id);
return cookie;
}
EXPORT_SYMBOL(__fscache_acquire_cookie);
@@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed;
bool need_withdraw = false;
- _enter("");
+ kenter("");
if (!cookie->volume->cache_priv) {
fscache_create_volume(cookie->volume, true);
@@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
if (cookie->state != FSCACHE_COOKIE_STATE_FAILED)
fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT);
need_withdraw = true;
- _leave(" [fail]");
+ kleave(" [fail]");
goto out;
}
@@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify)
bool queue = false;
int n_active;
- _enter("c=%08x", cookie->debug_id);
+ kenter("c=%08x", cookie->debug_id);
if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
"Trying to use relinquished cookie\n"))
@@ -636,7 +636,7 @@ again:
spin_unlock(&cookie->lock);
if (queue)
fscache_queue_cookie(cookie, fscache_cookie_get_use_work);
- _leave("");
+ kleave("");
}
EXPORT_SYMBOL(__fscache_use_cookie);
@@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie)
enum fscache_cookie_state state;
bool wake = false;
- _enter("c=%x", cookie->debug_id);
+ kenter("c=%x", cookie->debug_id);
again:
spin_lock(&cookie->lock);
@@ -820,7 +820,7 @@ out:
spin_unlock(&cookie->lock);
if (wake)
wake_up_cookie_state(cookie);
- _leave("");
+ kleave("");
}
static void fscache_cookie_worker(struct work_struct *work)
@@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie)
set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags);
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_cookies_lru_expired);
- _debug("lru c=%x", cookie->debug_id);
+ kdebug("lru c=%x", cookie->debug_id);
__fscache_withdraw_cookie(cookie);
}
@@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
if (retire)
fscache_stat(&fscache_n_relinquishes_retire);
- _enter("c=%08x{%d},%d",
+ kenter("c=%08x{%d},%d",
cookie->debug_id, atomic_read(&cookie->n_active), retire);
if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags),
@@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
{
bool is_caching;
- _enter("c=%x", cookie->debug_id);
+ kenter("c=%x", cookie->debug_id);
fscache_stat(&fscache_n_invalidates);
@@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */
default:
spin_unlock(&cookie->lock);
- _leave(" [no %u]", cookie->state);
+ kleave(" [no %u]", cookie->state);
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
@@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
- _leave(" [look %x]", cookie->inval_counter);
+ kleave(" [look %x]", cookie->inval_counter);
return;
case FSCACHE_COOKIE_STATE_ACTIVE:
@@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
if (is_caching)
fscache_queue_cookie(cookie, fscache_cookie_get_inval_work);
- _leave(" [inv]");
+ kleave(" [inv]");
return;
}
}
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 38637e5c9b57..bf4eaeec44fb 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres,
again:
if (!fscache_cache_is_live(cookie->volume->cache)) {
- _leave(" [broken]");
+ kleave(" [broken]");
return false;
}
state = fscache_cookie_state(cookie);
- _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
+ kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
switch (state) {
case FSCACHE_COOKIE_STATE_CREATING:
@@ -52,7 +52,7 @@ again:
case FSCACHE_COOKIE_STATE_DROPPED:
case FSCACHE_COOKIE_STATE_RELINQUISHING:
default:
- _leave(" [not live]");
+ kleave(" [not live]");
return false;
}
@@ -92,7 +92,7 @@ again:
spin_lock(&cookie->lock);
state = fscache_cookie_state(cookie);
- _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
+ kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state);
switch (state) {
case FSCACHE_COOKIE_STATE_LOOKING_UP:
@@ -140,7 +140,7 @@ failed:
cres->cache_priv = NULL;
cres->ops = NULL;
fscache_end_cookie_access(cookie, fscache_access_io_not_live);
- _leave(" = -ENOBUFS");
+ kleave(" = -ENOBUFS");
return -ENOBUFS;
}
@@ -224,7 +224,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
if (len == 0)
goto abandon;
- _enter("%llx,%zx", start, len);
+ kenter("%llx,%zx", start, len);
wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS);
if (!wreq)
diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c
index 42e98bb523e3..bf9b33d26e31 100644
--- a/fs/netfs/fscache_main.c
+++ b/fs/netfs/fscache_main.c
@@ -99,7 +99,7 @@ error_wq:
*/
void __exit fscache_exit(void)
{
- _enter("");
+ kenter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..2e2a405ca9b0 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -27,6 +27,19 @@ struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
return volume;
}
+struct fscache_volume *fscache_try_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where)
+{
+ int ref;
+
+ if (!__refcount_inc_not_zero(&volume->ref, &ref))
+ return NULL;
+
+ trace_fscache_volume(volume->debug_id, ref + 1, where);
+ return volume;
+}
+EXPORT_SYMBOL(fscache_try_get_volume);
+
static void fscache_see_volume(struct fscache_volume *volume,
enum fscache_volume_trace where)
{
@@ -251,7 +264,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key,
fscache_see_volume(volume, fscache_volume_new_acquire);
fscache_stat(&fscache_n_volumes);
up_write(&fscache_addremove_sem);
- _leave(" = v=%x", volume->debug_id);
+ kleave(" = v=%x", volume->debug_id);
return volume;
err_vol:
@@ -420,6 +433,7 @@ void fscache_put_volume(struct fscache_volume *volume,
fscache_free_volume(volume);
}
}
+EXPORT_SYMBOL(fscache_put_volume);
/*
* Relinquish a volume representation cookie.
@@ -452,7 +466,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume)
{
int n_accesses;
- _debug("withdraw V=%x", volume->debug_id);
+ kdebug("withdraw V=%x", volume->debug_id);
/* Allow wakeups on dec-to-0 */
n_accesses = atomic_dec_return(&volume->n_accesses);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 95e281a8af78..21e46bc9aa49 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -34,7 +34,6 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
/*
* main.c
*/
-extern unsigned int netfs_debug;
extern struct list_head netfs_io_requests;
extern spinlock_t netfs_proc_lock;
extern mempool_t netfs_request_pool;
@@ -63,15 +62,6 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
-#define NETFS_FLAG_PUT_MARK BIT(0)
-#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask);
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask);
-void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -353,8 +343,6 @@ extern const struct seq_operations fscache_volumes_seq_ops;
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
bool fscache_begin_volume_access(struct fscache_volume *volume,
struct fscache_cookie *cookie,
enum fscache_access_trace why);
@@ -365,42 +353,12 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait);
* debug tracing
*/
#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+ pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_NETFS_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (netfs_debug) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (netfs_debug) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (netfs_debug) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
/*
* assertions
*/
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index c93851b98368..c7576481c321 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -130,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
if (count == remaining)
return;
- _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
rreq->debug_id, subreq->debug_index,
iov_iter_count(&subreq->io_iter), subreq->transferred,
subreq->len, rreq->i_size,
@@ -326,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("R=%x[%x]{%llx,%lx},%zd",
+ kenter("R=%x[%x]{%llx,%lx},%zd",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->flags, transferred_or_error);
@@ -435,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
struct netfs_inode *ictx = netfs_inode(rreq->inode);
size_t lsize;
- _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+ kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
if (rreq->origin != NETFS_DIO_READ) {
source = netfs_cache_prepare_read(subreq, rreq->i_size);
@@ -518,7 +518,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->start = rreq->start + rreq->submitted;
subreq->len = io_iter->count;
- _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
+ kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
/* Call out to the cache to find out what it can do with the remaining
@@ -570,7 +570,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
struct iov_iter io_iter;
int ret;
- _enter("R=%x %llx-%llx",
+ kenter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
if (rreq->len == 0) {
@@ -593,7 +593,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
atomic_set(&rreq->nr_outstanding, 1);
io_iter = rreq->io_iter;
do {
- _debug("submit %llx + %llx >= %llx",
+ kdebug("submit %llx + %llx >= %llx",
rreq->start, rreq->submitted, rreq->i_size);
if (rreq->origin == NETFS_DIO_READ &&
rreq->start + rreq->submitted >= rreq->i_size)
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5f0f438e5d21..db824c372842 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -20,10 +20,6 @@ MODULE_LICENSE("GPL");
EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
-unsigned netfs_debug;
-module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
-
static struct kmem_cache *netfs_request_slab;
static struct kmem_cache *netfs_subrequest_slab;
mempool_t netfs_request_pool;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index bc1fc54fb724..172808e83ca8 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -8,87 +8,6 @@
#include <linux/swap.h>
#include "internal.h"
-/*
- * Attach a folio to the buffer and maybe set marks on it to say that we need
- * to put the folio later and twiddle the pagecache flags.
- */
-int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
- struct folio *folio, unsigned int flags,
- gfp_t gfp_mask)
-{
- XA_STATE_ORDER(xas, xa, index, folio_order(folio));
-
-retry:
- xas_lock(&xas);
- for (;;) {
- xas_store(&xas, folio);
- if (!xas_error(&xas))
- break;
- xas_unlock(&xas);
- if (!xas_nomem(&xas, gfp_mask))
- return xas_error(&xas);
- goto retry;
- }
-
- if (flags & NETFS_FLAG_PUT_MARK)
- xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
- if (flags & NETFS_FLAG_PAGECACHE_MARK)
- xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
- xas_unlock(&xas);
- return xas_error(&xas);
-}
-
-/*
- * Create the specified range of folios in the buffer attached to the read
- * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
- * these need freeing later.
- */
-int netfs_add_folios_to_buffer(struct xarray *buffer,
- struct address_space *mapping,
- pgoff_t index, pgoff_t to, gfp_t gfp_mask)
-{
- struct folio *folio;
- int ret;
-
- if (to + 1 == index) /* Page range is inclusive */
- return 0;
-
- do {
- /* TODO: Figure out what order folio can be allocated here */
- folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
- if (!folio)
- return -ENOMEM;
- folio->index = index;
- ret = netfs_xa_store_and_mark(buffer, index, folio,
- NETFS_FLAG_PUT_MARK, gfp_mask);
- if (ret < 0) {
- folio_put(folio);
- return ret;
- }
-
- index += folio_nr_pages(folio);
- } while (index <= to && index != 0);
-
- return 0;
-}
-
-/*
- * Clear an xarray buffer, putting a ref on the folios that have
- * NETFS_BUF_PUT_MARK set.
- */
-void netfs_clear_buffer(struct xarray *buffer)
-{
- struct folio *folio;
- XA_STATE(xas, buffer, 0);
-
- rcu_read_lock();
- xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
- folio_put(folio);
- }
- rcu_read_unlock();
- xa_destroy(buffer);
-}
-
/**
* netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
* @mapping: The mapping the folio belongs to.
@@ -107,7 +26,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
struct fscache_cookie *cookie = netfs_i_cookie(ictx);
bool need_use = false;
- _enter("");
+ kenter("");
if (!filemap_dirty_folio(mapping, folio))
return false;
@@ -180,7 +99,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
struct netfs_folio *finfo;
size_t flen = folio_size(folio);
- _enter("{%lx},%zx,%zx", folio->index, offset, length);
+ kenter("{%lx},%zx,%zx", folio->index, offset, length);
if (!folio_test_private(folio))
return;
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 426cf87aaf2e..488147439fe0 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -161,7 +161,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
{
struct list_head *next;
- _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+ kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
if (list_empty(&stream->subrequests))
return;
@@ -374,7 +374,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
unsigned int notes;
int s;
- _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+ kenter("%llx-%llx", wreq->start, wreq->start + wreq->len);
trace_netfs_collect(wreq);
trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
@@ -409,7 +409,7 @@ reassess_streams:
front = stream->front;
while (front) {
trace_netfs_collect_sreq(wreq, front);
- //_debug("sreq [%x] %llx %zx/%zx",
+ //kdebug("sreq [%x] %llx %zx/%zx",
// front->debug_index, front->start, front->transferred, front->len);
/* Stall if there may be a discontinuity. */
@@ -598,7 +598,7 @@ reassess_streams:
out:
netfs_put_group_many(wreq->group, wreq->nr_group_rel);
wreq->nr_group_rel = 0;
- _leave(" = %x", notes);
+ kleave(" = %x", notes);
return;
need_retry:
@@ -606,7 +606,7 @@ need_retry:
* that any partially completed op will have had any wholly transferred
* folios removed from it.
*/
- _debug("retry");
+ kdebug("retry");
netfs_retry_writes(wreq);
goto out;
}
@@ -621,7 +621,7 @@ void netfs_write_collection_worker(struct work_struct *work)
size_t transferred;
int s;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
netfs_see_request(wreq, netfs_rreq_trace_see_work);
if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
@@ -684,7 +684,7 @@ void netfs_write_collection_worker(struct work_struct *work)
if (wreq->origin == NETFS_DIO_WRITE)
inode_dio_end(wreq->inode);
- _debug("finished");
+ kdebug("finished");
trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -744,7 +744,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
struct netfs_io_request *wreq = subreq->rreq;
struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
- _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+ kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
switch (subreq->source) {
case NETFS_UPLOAD_TO_SERVER:
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 3aa86e268f40..d7c971df8866 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -99,7 +99,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
if (IS_ERR(wreq))
return wreq;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode);
if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
@@ -159,7 +159,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
subreq->max_nr_segs = INT_MAX;
subreq->stream_nr = stream->stream_nr;
- _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+ kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
@@ -215,7 +215,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
{
struct netfs_io_request *wreq = subreq->rreq;
- _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+ kenter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
return netfs_write_subrequest_terminated(subreq, subreq->error, false);
@@ -272,11 +272,11 @@ int netfs_advance_write(struct netfs_io_request *wreq,
size_t part;
if (!stream->avail) {
- _leave("no write");
+ kleave("no write");
return len;
}
- _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+ kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
if (subreq && start != subreq->start + subreq->len) {
netfs_issue_write(wreq, stream);
@@ -288,7 +288,7 @@ int netfs_advance_write(struct netfs_io_request *wreq,
subreq = stream->construct;
part = min(subreq->max_len - subreq->len, len);
- _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+ kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
subreq->len += part;
subreq->nr_segs++;
@@ -319,7 +319,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
bool to_eof = false, streamw = false;
bool debug = false;
- _enter("");
+ kenter("");
/* netfs_perform_write() may shift i_size around the page or from out
* of the page to beyond it, but cannot move i_size into or through the
@@ -329,7 +329,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (fpos >= i_size) {
/* mmap beyond eof. */
- _debug("beyond eof");
+ kdebug("beyond eof");
folio_start_writeback(folio);
folio_unlock(folio);
wreq->nr_group_rel += netfs_folio_written_back(folio);
@@ -363,7 +363,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
}
flen -= foff;
- _debug("folio %zx %zx %zx", foff, flen, fsize);
+ kdebug("folio %zx %zx %zx", foff, flen, fsize);
/* Deal with discontinuities in the stream of dirty pages. These can
* arise from a number of sources:
@@ -483,11 +483,11 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (!debug)
kdebug("R=%x: No submit", wreq->debug_id);
- if (flen < fsize)
+ if (foff + flen < fsize)
for (int s = 0; s < NR_IO_STREAMS; s++)
netfs_issue_write(wreq, &wreq->io_streams[s]);
- _leave(" = 0");
+ kleave(" = 0");
return 0;
}
@@ -522,7 +522,7 @@ int netfs_writepages(struct address_space *mapping,
netfs_stat(&netfs_n_wh_writepages);
do {
- _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+ kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
/* It appears we don't have to handle cyclic writeback wrapping. */
WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
@@ -546,14 +546,14 @@ int netfs_writepages(struct address_space *mapping,
mutex_unlock(&ictx->wb_lock);
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
couldnt_start:
netfs_kill_dirty_pages(mapping, wbc, folio);
out:
mutex_unlock(&ictx->wb_lock);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
}
EXPORT_SYMBOL(netfs_writepages);
@@ -590,7 +590,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
struct folio *folio, size_t copied, bool to_page_end,
struct folio **writethrough_cache)
{
- _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+ kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
if (!*writethrough_cache) {
@@ -624,7 +624,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
struct netfs_inode *ictx = netfs_inode(wreq->inode);
int ret;
- _enter("R=%x", wreq->debug_id);
+ kenter("R=%x", wreq->debug_id);
if (writethrough_cache)
netfs_write_folio(wreq, wbc, writethrough_cache);
@@ -657,7 +657,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
loff_t start = wreq->start;
int error = 0;
- _enter("%zx", len);
+ kenter("%zx", len);
if (wreq->origin == NETFS_DIO_WRITE)
inode_dio_begin(wreq->inode);
@@ -665,7 +665,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
while (len) {
// TODO: Prepare content encryption
- _debug("unbuffered %zx", len);
+ kdebug("unbuffered %zx", len);
part = netfs_advance_write(wreq, upload, start, len, false);
start += part;
len -= part;
@@ -684,6 +684,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
if (list_empty(&upload->subrequests))
netfs_wake_write_collector(wreq, false);
- _leave(" = %d", error);
+ kleave(" = %d", error);
return error;
}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6be13e0ec170..0becdec12970 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -564,25 +564,32 @@ bl_find_get_deviceid(struct nfs_server *server,
gfp_t gfp_mask)
{
struct nfs4_deviceid_node *node;
- unsigned long start, end;
+ int err = -ENODEV;
retry:
node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
if (!node)
return ERR_PTR(-ENODEV);
- if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
- return node;
+ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+ unsigned long end = jiffies;
+ unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
- end = jiffies;
- start = end - PNFS_DEVICE_RETRY_TIMEOUT;
- if (!time_in_range(node->timestamp_unavailable, start, end)) {
- nfs4_delete_deviceid(node->ld, node->nfs_client, id);
- goto retry;
+ if (!time_in_range(node->timestamp_unavailable, start, end)) {
+ nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+ goto retry;
+ }
+ goto out_put;
}
+ if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node)))
+ goto out_put;
+
+ return node;
+
+out_put:
nfs4_put_deviceid_node(node);
- return ERR_PTR(-ENODEV);
+ return ERR_PTR(err);
}
static int
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f1eeb4914199..6da40ca19570 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -104,20 +104,26 @@ struct pnfs_block_dev {
u64 start;
u64 len;
+ enum pnfs_block_volume_type type;
u32 nr_children;
struct pnfs_block_dev *children;
u64 chunk_size;
struct file *bdev_file;
u64 disk_offset;
+ unsigned long flags;
u64 pr_key;
- bool pr_registered;
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
+/* pnfs_block_dev flag bits */
+enum {
+ PNFS_BDEV_REGISTERED = 0,
+};
+
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
@@ -172,6 +178,7 @@ struct bl_msg_hdr {
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
/* dev.c */
+bool bl_register_dev(struct pnfs_block_dev *d);
struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_mask);
void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 93ef7f864980..6252f4447945 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -10,12 +10,83 @@
#include <linux/pr.h>
#include "blocklayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+static void bl_unregister_scsi(struct pnfs_block_dev *dev)
+{
+ struct block_device *bdev = file_bdev(dev->bdev_file);
+ const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+ int status;
+
+ if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+ return;
+
+ status = ops->pr_register(bdev, dev->pr_key, 0, false);
+ if (status)
+ trace_bl_pr_key_unreg_err(bdev, dev->pr_key, status);
+ else
+ trace_bl_pr_key_unreg(bdev, dev->pr_key);
+}
+
+static bool bl_register_scsi(struct pnfs_block_dev *dev)
+{
+ struct block_device *bdev = file_bdev(dev->bdev_file);
+ const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+ int status;
+
+ if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+ return true;
+
+ status = ops->pr_register(bdev, 0, dev->pr_key, true);
+ if (status) {
+ trace_bl_pr_key_reg_err(bdev, dev->pr_key, status);
+ return false;
+ }
+ trace_bl_pr_key_reg(bdev, dev->pr_key);
+ return true;
+}
+
+static void bl_unregister_dev(struct pnfs_block_dev *dev)
+{
+ u32 i;
+
+ if (dev->nr_children) {
+ for (i = 0; i < dev->nr_children; i++)
+ bl_unregister_dev(&dev->children[i]);
+ return;
+ }
+
+ if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
+ bl_unregister_scsi(dev);
+}
+
+bool bl_register_dev(struct pnfs_block_dev *dev)
+{
+ u32 i;
+
+ if (dev->nr_children) {
+ for (i = 0; i < dev->nr_children; i++) {
+ if (!bl_register_dev(&dev->children[i])) {
+ while (i > 0)
+ bl_unregister_dev(&dev->children[--i]);
+ return false;
+ }
+ }
+ return true;
+ }
+
+ if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
+ return bl_register_scsi(dev);
+ return true;
+}
+
static void
bl_free_device(struct pnfs_block_dev *dev)
{
+ bl_unregister_dev(dev);
+
if (dev->nr_children) {
int i;
@@ -23,17 +94,6 @@ bl_free_device(struct pnfs_block_dev *dev)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
- if (dev->pr_registered) {
- const struct pr_ops *ops =
- file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
- int error;
-
- error = ops->pr_register(file_bdev(dev->bdev_file),
- dev->pr_key, 0, false);
- if (error)
- pr_err("failed to unregister PR key.\n");
- }
-
if (dev->bdev_file)
fput(dev->bdev_file);
}
@@ -314,7 +374,7 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
NULL, NULL);
if (IS_ERR(bdev_file)) {
- pr_warn("pNFS: failed to open device %s (%ld)\n",
+ dprintk("failed to open device %s (%ld)\n",
devname, PTR_ERR(bdev_file));
}
@@ -327,8 +387,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
- struct file *bdev_file;
+ struct block_device *bdev;
const struct pr_ops *ops;
+ struct file *bdev_file;
int error;
if (!bl_validate_designator(v))
@@ -344,35 +405,30 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
if (IS_ERR(bdev_file))
bdev_file = bl_open_path(v, "wwn-0x");
if (IS_ERR(bdev_file))
+ bdev_file = bl_open_path(v, "nvme-eui.");
+ if (IS_ERR(bdev_file)) {
+ pr_warn("pNFS: no device found for volume %*phN\n",
+ v->scsi.designator_len, v->scsi.designator);
return PTR_ERR(bdev_file);
+ }
d->bdev_file = bdev_file;
+ bdev = file_bdev(bdev_file);
- d->len = bdev_nr_bytes(file_bdev(d->bdev_file));
+ d->len = bdev_nr_bytes(bdev);
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
if (d->len == 0)
return -ENODEV;
- pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
- file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key);
-
- ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops;
+ ops = bdev->bd_disk->fops->pr_ops;
if (!ops) {
pr_err("pNFS: block device %s does not support reservations.",
- file_bdev(d->bdev_file)->bd_disk->disk_name);
+ bdev->bd_disk->disk_name);
error = -EINVAL;
goto out_blkdev_put;
}
- error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
- if (error) {
- pr_err("pNFS: failed to register key for block device %s.",
- file_bdev(d->bdev_file)->bd_disk->disk_name);
- goto out_blkdev_put;
- }
-
- d->pr_registered = true;
return 0;
out_blkdev_put:
@@ -458,7 +514,9 @@ static int
bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
- switch (volumes[idx].type) {
+ d->type = volumes[idx].type;
+
+ switch (d->type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
return bl_parse_simple(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_SLICE:
@@ -470,7 +528,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
case PNFS_BLOCK_VOLUME_SCSI:
return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
default:
- dprintk("unsupported volume type: %d\n", volumes[idx].type);
+ dprintk("unsupported volume type: %d\n", d->type);
return -EIO;
}
}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 650758ee0d5f..154a6ed1299f 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -46,14 +46,15 @@ struct cb_compound_hdr_res {
struct cb_getattrargs {
struct nfs_fh fh;
- uint32_t bitmap[2];
+ uint32_t bitmap[3];
};
struct cb_getattrres {
__be32 status;
- uint32_t bitmap[2];
+ uint32_t bitmap[3];
uint64_t size;
uint64_t change_attr;
+ struct timespec64 atime;
struct timespec64 ctime;
struct timespec64 mtime;
};
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 76cea34477ae..7832fb0369a1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,7 +37,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
if (!cps->clp) /* Always set for v4.0. Set in cb_sequence for v4.1 */
goto out;
- res->bitmap[0] = res->bitmap[1] = 0;
+ memset(res->bitmap, 0, sizeof(res->bitmap));
res->status = htonl(NFS4ERR_BADHANDLE);
dprintk_rcu("NFS: GETATTR callback request from %s\n",
@@ -59,12 +59,16 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
res->change_attr = delegation->change_attr;
if (nfs_have_writebacks(inode))
res->change_attr++;
+ res->atime = inode_get_atime(inode);
res->ctime = inode_get_ctime(inode);
res->mtime = inode_get_mtime(inode);
- res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
- args->bitmap[0];
- res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
- args->bitmap[1];
+ res->bitmap[0] = (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE) &
+ args->bitmap[0];
+ res->bitmap[1] = (FATTR4_WORD1_TIME_ACCESS |
+ FATTR4_WORD1_TIME_METADATA |
+ FATTR4_WORD1_TIME_MODIFY) & args->bitmap[1];
+ res->bitmap[2] = (FATTR4_WORD2_TIME_DELEG_ACCESS |
+ FATTR4_WORD2_TIME_DELEG_MODIFY) & args->bitmap[2];
res->status = 0;
out_iput:
rcu_read_unlock();
@@ -319,9 +323,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
int stat;
if (args->cbl_recall_type == RETURN_FSID)
- stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true);
+ stat = pnfs_layout_destroy_byfsid(clp, &args->cbl_fsid,
+ PNFS_LAYOUT_BULK_RETURN);
else
- stat = pnfs_destroy_layouts_byclid(clp, true);
+ stat = pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_BULK_RETURN);
if (stat != 0)
return NFS4ERR_DELAY;
return NFS4ERR_NOMATCHING_LAYOUT;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 9369488f2ed4..29c49a7e5fe1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,8 +25,9 @@
#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
CB_OP_GETATTR_BITMAP_MAXSZ + \
- /* change, size, ctime, mtime */\
- (2 + 2 + 3 + 3) * 4)
+ /* change, size, atime, ctime,
+ * mtime, deleg_atime, deleg_mtime */\
+ (2 + 2 + 3 + 3 + 3 + 3 + 3) * 4)
#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
@@ -635,6 +636,13 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *
return 0;
}
+static __be32 encode_attr_atime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
+{
+ if (!(bitmap[1] & FATTR4_WORD1_TIME_ACCESS))
+ return 0;
+ return encode_attr_time(xdr,time);
+}
+
static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time)
{
if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
@@ -649,6 +657,24 @@ static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap,
return encode_attr_time(xdr,time);
}
+static __be32 encode_attr_delegatime(struct xdr_stream *xdr,
+ const uint32_t *bitmap,
+ const struct timespec64 *time)
+{
+ if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS))
+ return 0;
+ return encode_attr_time(xdr,time);
+}
+
+static __be32 encode_attr_delegmtime(struct xdr_stream *xdr,
+ const uint32_t *bitmap,
+ const struct timespec64 *time)
+{
+ if (!(bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY))
+ return 0;
+ return encode_attr_time(xdr,time);
+}
+
static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
{
__be32 status;
@@ -699,10 +725,19 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = encode_attr_size(xdr, res->bitmap, res->size);
if (unlikely(status != 0))
goto out;
+ status = encode_attr_atime(xdr, res->bitmap, &res->atime);
+ if (unlikely(status != 0))
+ goto out;
status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
if (unlikely(status != 0))
goto out;
status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_delegatime(xdr, res->bitmap, &res->atime);
+ if (unlikely(status != 0))
+ goto out;
+ status = encode_attr_delegmtime(xdr, res->bitmap, &res->mtime);
*savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
out:
return status;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index de77848ae654..8286edd6062d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -994,6 +994,9 @@ struct nfs_server *nfs_alloc_server(void)
server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ init_waitqueue_head(&server->write_congestion_wait);
+ atomic_long_set(&server->writeback, 0);
+
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 6bace5fece04..d5edb3b3eeef 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -82,11 +82,10 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
}
-static bool
-nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
- fmode_t flags)
+static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+ fmode_t type)
{
- if (delegation != NULL && (delegation->type & flags) == flags &&
+ if (delegation != NULL && (delegation->type & type) == type &&
!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
return true;
@@ -103,19 +102,22 @@ struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
return NULL;
}
-static int
-nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
+static int nfs4_do_check_delegation(struct inode *inode, fmode_t type,
+ int flags, bool mark)
{
struct nfs_delegation *delegation;
int ret = 0;
- flags &= FMODE_READ|FMODE_WRITE;
+ type &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (nfs4_is_valid_delegation(delegation, flags)) {
+ if (nfs4_is_valid_delegation(delegation, type)) {
if (mark)
nfs_mark_delegation_referenced(delegation);
ret = 1;
+ if ((flags & NFS_DELEGATION_FLAG_TIME) &&
+ !test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags))
+ ret = 0;
}
rcu_read_unlock();
return ret;
@@ -124,22 +126,23 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
* nfs4_have_delegation - check if inode has a delegation, mark it
* NFS_DELEGATION_REFERENCED if there is one.
* @inode: inode to check
- * @flags: delegation types to check for
+ * @type: delegation types to check for
+ * @flags: various modifiers
*
* Returns one if inode has the indicated delegation, otherwise zero.
*/
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags)
{
- return nfs4_do_check_delegation(inode, flags, true);
+ return nfs4_do_check_delegation(inode, type, flags, true);
}
/*
* nfs4_check_delegation - check if inode has a delegation, do not mark
* NFS_DELEGATION_REFERENCED if it has one.
*/
-int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+int nfs4_check_delegation(struct inode *inode, fmode_t type)
{
- return nfs4_do_check_delegation(inode, flags, false);
+ return nfs4_do_check_delegation(inode, type, 0, false);
}
static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid)
@@ -221,11 +224,12 @@ again:
* @type: delegation type
* @stateid: delegation stateid
* @pagemod_limit: write delegation "space_limit"
+ * @deleg_type: raw delegation type
*
*/
void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
fmode_t type, const nfs4_stateid *stateid,
- unsigned long pagemod_limit)
+ unsigned long pagemod_limit, u32 deleg_type)
{
struct nfs_delegation *delegation;
const struct cred *oldcred = NULL;
@@ -239,6 +243,14 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
delegation->pagemod_limit = pagemod_limit;
oldcred = delegation->cred;
delegation->cred = get_cred(cred);
+ switch (deleg_type) {
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
+ break;
+ default:
+ clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
+ }
clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
&delegation->flags))
@@ -250,11 +262,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
} else {
rcu_read_unlock();
nfs_inode_set_delegation(inode, cred, type, stateid,
- pagemod_limit);
+ pagemod_limit, deleg_type);
}
}
-static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+static int nfs_do_return_delegation(struct inode *inode,
+ struct nfs_delegation *delegation,
+ int issync)
{
const struct cred *cred;
int res = 0;
@@ -263,9 +277,8 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
spin_lock(&delegation->lock);
cred = get_cred(delegation->cred);
spin_unlock(&delegation->lock);
- res = nfs4_proc_delegreturn(inode, cred,
- &delegation->stateid,
- issync);
+ res = nfs4_proc_delegreturn(inode, cred, &delegation->stateid,
+ delegation, issync);
put_cred(cred);
}
return res;
@@ -418,13 +431,13 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
* @type: delegation type
* @stateid: delegation stateid
* @pagemod_limit: write delegation "space_limit"
+ * @deleg_type: raw delegation type
*
* Returns zero on success, or a negative errno value.
*/
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type,
- const nfs4_stateid *stateid,
- unsigned long pagemod_limit)
+ fmode_t type, const nfs4_stateid *stateid,
+ unsigned long pagemod_limit, u32 deleg_type)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_client *clp = server->nfs_client;
@@ -444,6 +457,11 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
delegation->cred = get_cred(cred);
delegation->inode = inode;
delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ switch (deleg_type) {
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ delegation->flags |= BIT(NFS_DELEGATION_DELEGTIME);
+ }
delegation->test_gen = 0;
spin_lock_init(&delegation->lock);
@@ -508,6 +526,11 @@ add_new:
atomic_long_inc(&nfs_active_delegations);
trace_nfs4_set_delegation(inode, type);
+
+ /* If we hold writebacks and have delegated mtime then update */
+ if (deleg_type == NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG &&
+ nfs_have_writebacks(inode))
+ nfs_update_delegated_mtime(inode);
out:
spin_unlock(&clp->cl_lock);
if (delegation != NULL)
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index a6f495d012cf..71524d34ed20 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -38,12 +38,15 @@ enum {
NFS_DELEGATION_TEST_EXPIRED,
NFS_DELEGATION_INODE_FREEING,
NFS_DELEGATION_RETURN_DELAYED,
+ NFS_DELEGATION_DELEGTIME,
};
int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+ fmode_t type, const nfs4_stateid *stateid,
+ unsigned long pagemod_limit, u32 deleg_type);
void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit);
+ fmode_t type, const nfs4_stateid *stateid,
+ unsigned long pagemod_limit, u32 deleg_type);
int nfs4_inode_return_delegation(struct inode *inode);
void nfs4_inode_return_delegation_on_close(struct inode *inode);
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
@@ -67,7 +70,9 @@ void nfs_test_expired_all_delegations(struct nfs_client *clp);
void nfs_reap_expired_delegations(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
-int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync);
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
+ const nfs4_stateid *stateid,
+ struct nfs_delegation *delegation, int issync);
int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
@@ -75,8 +80,8 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
-int nfs4_have_delegation(struct inode *inode, fmode_t flags);
-int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+int nfs4_have_delegation(struct inode *inode, fmode_t type, int flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t type);
bool nfs4_delegation_flush_on_close(const struct inode *inode);
void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
const nfs4_stateid *stateid);
@@ -84,9 +89,37 @@ int nfs4_inode_make_writeable(struct inode *inode);
#endif
+#define NFS_DELEGATION_FLAG_TIME BIT(1)
+
+void nfs_update_delegated_atime(struct inode *inode);
+void nfs_update_delegated_mtime(struct inode *inode);
+void nfs_update_delegated_mtime_locked(struct inode *inode);
+
+static inline int nfs_have_read_or_write_delegation(struct inode *inode)
+{
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0);
+}
+
+static inline int nfs_have_write_delegation(struct inode *inode)
+{
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE, 0);
+}
+
static inline int nfs_have_delegated_attributes(struct inode *inode)
{
- return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0);
+}
+
+static inline int nfs_have_delegated_atime(struct inode *inode)
+{
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ,
+ NFS_DELEGATION_FLAG_TIME);
+}
+
+static inline int nfs_have_delegated_mtime(struct inode *inode)
+{
+ return NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE,
+ NFS_DELEGATION_FLAG_TIME);
}
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 07a7be27182e..4cb97ef41350 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1437,7 +1437,7 @@ static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
if (!dir || !nfs_verify_change_attribute(dir, verf))
return;
- if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0))
nfs_set_verifier_delegated(&verf);
dentry->d_time = verf;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bb2f583eb28b..90079ca134dd 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = nfs_file_direct_read(iocb, iter, true);
else
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6bd127e6683d..61a8cdb9f1e1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -339,6 +339,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, struct page **pagep,
void **fsdata)
{
+ fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
int once_thru = 0;
int ret;
@@ -346,8 +347,9 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
+ fgp |= fgf_set_order(len);
start:
- folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -425,7 +427,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
static void nfs_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
@@ -434,7 +436,7 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
/* Cancel any unstarted writes on this page */
nfs_wb_folio_cancel(inode, folio);
folio_wait_private_2(folio); /* [DEPRECATED] */
- trace_nfs_invalidate_folio(inode, folio);
+ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length);
}
/*
@@ -452,7 +454,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
current_is_kswapd())
return false;
- if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0)
+ if (nfs_wb_folio(folio->mapping->host, folio) < 0)
return false;
}
return nfs_fscache_release_folio(folio, gfp);
@@ -502,7 +504,8 @@ static int nfs_launder_folio(struct folio *folio)
folio_wait_private_2(folio); /* [DEPRECATED] */
ret = nfs_wb_folio(inode, folio);
- trace_nfs_launder_folio_done(inode, folio, ret);
+ trace_nfs_launder_folio_done(inode, folio_pos(folio),
+ folio_size(folio), ret);
return ret;
}
@@ -588,7 +591,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
filp, filp->f_mapping->host->i_ino,
- (long long)folio_file_pos(folio));
+ (long long)folio_pos(folio));
sb_start_pagefault(inode->i_sb);
@@ -604,7 +607,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
folio_lock(folio);
- mapping = folio_file_mapping(folio);
+ mapping = folio->mapping;
if (mapping != inode->i_mapping)
goto out_unlock;
@@ -730,7 +733,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
}
fl->c.flc_type = saved_type;
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (nfs_have_read_or_write_delegation(inode))
goto out_noconflict;
if (is_local)
@@ -813,7 +816,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
+ if (!nfs_have_read_or_write_delegation(inode)) {
nfs_zap_caches(inode);
if (mapping_mapped(filp->f_mapping))
nfs_revalidate_mapping(inode, filp->f_mapping);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 29d84dc66ca3..b6e9aeaf4ce2 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1110,7 +1110,6 @@ static const struct pnfs_commit_ops filelayout_commit_ops = {
.clear_request_commit = pnfs_generic_clear_request_commit,
.scan_commit_lists = pnfs_generic_scan_commit_lists,
.recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .search_commit_reqs = pnfs_generic_search_commit_reqs,
.commit_pagelist = filelayout_commit_pagelist,
};
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 24188af56d5b..39ba9f4208aa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2548,7 +2548,7 @@ ff_layout_set_layoutdriver(struct nfs_server *server,
const struct nfs_fh *dummy)
{
#if IS_ENABLED(CONFIG_NFS_V4_2)
- server->caps |= NFS_CAP_LAYOUTSTATS;
+ server->caps |= NFS_CAP_LAYOUTSTATS | NFS_CAP_REBOOT_LAYOUTRETURN;
#endif
return 0;
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index ddc1ee031955..7202ce84d0eb 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -341,7 +341,7 @@ void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr)
int nfs_netfs_folio_unlock(struct folio *folio)
{
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
/*
* If fscache is enabled, netfs will unlock pages.
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index acef52ecb1bb..b4914a11c3c2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -190,9 +190,8 @@ static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi)
void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
{
struct nfs_inode *nfsi = NFS_I(inode);
- bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ);
- if (have_delegation) {
+ if (nfs_have_delegated_attributes(inode)) {
if (!(flags & NFS_INO_REVAL_FORCED))
flags &= ~(NFS_INO_INVALID_MODE |
NFS_INO_INVALID_OTHER |
@@ -276,6 +275,8 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
void nfs_invalidate_atime(struct inode *inode)
{
+ if (nfs_have_delegated_atime(inode))
+ return;
spin_lock(&inode->i_lock);
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
spin_unlock(&inode->i_lock);
@@ -491,6 +492,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
nfs_inode_init_regular(nfsi);
+ mapping_set_large_folios(inode->i_mapping);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
@@ -604,6 +606,55 @@ out_no_inode:
}
EXPORT_SYMBOL_GPL(nfs_fhget);
+static void
+nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr)
+{
+ unsigned long cache_validity = NFS_I(inode)->cache_validity;
+
+ if (nfs_have_delegated_mtime(inode)) {
+ if (!(cache_validity & NFS_INO_INVALID_CTIME))
+ fattr->valid &= ~(NFS_ATTR_FATTR_PRECTIME |
+ NFS_ATTR_FATTR_CTIME);
+
+ if (!(cache_validity & NFS_INO_INVALID_MTIME))
+ fattr->valid &= ~(NFS_ATTR_FATTR_PREMTIME |
+ NFS_ATTR_FATTR_MTIME);
+
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ fattr->valid &= ~NFS_ATTR_FATTR_ATIME;
+ } else if (nfs_have_delegated_atime(inode)) {
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ fattr->valid &= ~NFS_ATTR_FATTR_ATIME;
+ }
+}
+
+void nfs_update_delegated_atime(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (nfs_have_delegated_atime(inode)) {
+ inode_update_timestamps(inode, S_ATIME);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME;
+ }
+ spin_unlock(&inode->i_lock);
+}
+
+void nfs_update_delegated_mtime_locked(struct inode *inode)
+{
+ if (nfs_have_delegated_mtime(inode)) {
+ inode_update_timestamps(inode, S_CTIME | S_MTIME);
+ NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME |
+ NFS_INO_INVALID_MTIME);
+ }
+}
+
+void nfs_update_delegated_mtime(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_update_delegated_mtime_locked(inode);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_update_delegated_mtime);
+
#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN)
int
@@ -631,6 +682,17 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
attr->ia_valid &= ~ATTR_SIZE;
}
+ if (nfs_have_delegated_mtime(inode)) {
+ if (attr->ia_valid & ATTR_MTIME) {
+ nfs_update_delegated_mtime(inode);
+ attr->ia_valid &= ~ATTR_MTIME;
+ }
+ if (attr->ia_valid & ATTR_ATIME) {
+ nfs_update_delegated_atime(inode);
+ attr->ia_valid &= ~ATTR_ATIME;
+ }
+ }
+
/* Optimization: if the end result is no change, don't RPC */
if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0)
return 0;
@@ -686,6 +748,7 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
spin_unlock(&inode->i_lock);
truncate_pagecache(inode, offset);
+ nfs_update_delegated_mtime_locked(inode);
spin_lock(&inode->i_lock);
out:
return err;
@@ -709,8 +772,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
spin_lock(&inode->i_lock);
NFS_I(inode)->attr_gencount = fattr->gencount;
if ((attr->ia_valid & ATTR_SIZE) != 0) {
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME |
- NFS_INO_INVALID_BLOCKS);
+ if (!nfs_have_delegated_mtime(inode))
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
@@ -856,8 +920,12 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
/* Flush out writes to the server in order to update c/mtime/version. */
if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) &&
- S_ISREG(inode->i_mode))
- filemap_write_and_wait(inode->i_mapping);
+ S_ISREG(inode->i_mode)) {
+ if (nfs_have_delegated_mtime(inode))
+ filemap_fdatawrite(inode->i_mapping);
+ else
+ filemap_write_and_wait(inode->i_mapping);
+ }
/*
* We may force a getattr if the user cares about atime.
@@ -1012,7 +1080,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
if (!is_sync)
return;
inode = d_inode(ctx->dentry);
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (nfs_have_read_or_write_delegation(inode))
return;
nfsi = NFS_I(inode);
if (inode->i_mapping->nrpages == 0)
@@ -1482,7 +1550,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
unsigned long invalid = 0;
struct timespec64 ts;
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (nfs_have_delegated_attributes(inode))
return 0;
if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
@@ -2118,6 +2186,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
+ /* Fix up any delegated attributes in the struct nfs_fattr */
+ nfs_fattr_fixup_delegated(inode, fattr);
+
save_cache_validity = nfsi->cache_validity;
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ATIME
@@ -2538,6 +2609,7 @@ static void __exit exit_nfs_fs(void)
/* Not quite true; I just maintain it */
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_DESCRIPTION("NFS client support");
MODULE_LICENSE("GPL");
module_param(enable_ino64, bool, 0644);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9f0f4534744b..5902a9beca1f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -785,7 +785,7 @@ static inline void nfs_folio_mark_unstable(struct folio *folio,
struct nfs_commit_info *cinfo)
{
if (folio && !cinfo->dreq) {
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
long nr = folio_nr_pages(folio);
/* This page is really still in write-back - just that the
@@ -800,31 +800,12 @@ static inline void nfs_folio_mark_unstable(struct folio *folio,
/*
* Determine the number of bytes of data the page contains
*/
-static inline
-unsigned int nfs_page_length(struct page *page)
-{
- loff_t i_size = i_size_read(page_file_mapping(page)->host);
-
- if (i_size > 0) {
- pgoff_t index = page_index(page);
- pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
- if (index < end_index)
- return PAGE_SIZE;
- if (index == end_index)
- return ((i_size - 1) & ~PAGE_MASK) + 1;
- }
- return 0;
-}
-
-/*
- * Determine the number of bytes of data the page contains
- */
static inline size_t nfs_folio_length(struct folio *folio)
{
- loff_t i_size = i_size_read(folio_file_mapping(folio)->host);
+ loff_t i_size = i_size_read(folio->mapping->host);
if (i_size > 0) {
- pgoff_t index = folio_index(folio) >> folio_order(folio);
+ pgoff_t index = folio->index >> folio_order(folio);
pgoff_t end_index = (i_size - 1) >> folio_shift(folio);
if (index < end_index)
return folio_size(folio);
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index b17a9eb9b148..49862c95b224 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -46,6 +46,10 @@ static inline void nfs_add_stats(const struct inode *inode,
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
+/*
+ * This specialized allocator has to be a macro for its allocations to be
+ * accounted separately (to have a separate alloc_tag).
+ */
#define nfs_alloc_iostats() alloc_percpu(struct nfs_iostats)
static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 68e76b626371..57c9dd700b58 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -128,11 +128,6 @@ struct mountres {
rpc_authflavor_t *auth_flavors;
};
-struct mnt_fhstatus {
- u32 status;
- struct nfs_fh *fh;
-};
-
/**
* nfs_mount - Obtain an NFS file handle for the given host and path
* @info: pointer to mount request arguments
diff --git a/fs/nfs/nfs2super.c b/fs/nfs/nfs2super.c
index 467f21ee6a35..b1badc70bd71 100644
--- a/fs/nfs/nfs2super.c
+++ b/fs/nfs/nfs2super.c
@@ -26,6 +26,7 @@ static void __exit exit_nfs_v2(void)
unregister_nfs_version(&nfs_v2);
}
+MODULE_DESCRIPTION("NFSv2 client support");
MODULE_LICENSE("GPL");
module_init(init_nfs_v2);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 74bda639a7cf..1566163c6d85 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -979,11 +979,18 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
return status;
}
-static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
+static int nfs3_have_delegation(struct inode *inode, fmode_t type, int flags)
{
return 0;
}
+static int nfs3_return_delegation(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ nfs_wb_all(inode);
+ return 0;
+}
+
static const struct inode_operations nfs3_dir_inode_operations = {
.create = nfs_create,
.atomic_open = nfs_atomic_open_v23,
@@ -1062,6 +1069,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.clear_acl_cache = forget_all_cached_acls,
.close_context = nfs_close_context,
.have_delegation = nfs3_have_delegation,
+ .return_delegation = nfs3_return_delegation,
.alloc_client = nfs_alloc_client,
.init_client = nfs_init_client,
.free_client = nfs_free_client,
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 8a9be9e47f76..20a80478449e 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -27,6 +27,7 @@ static void __exit exit_nfs_v3(void)
unregister_nfs_version(&nfs_v3);
}
+MODULE_DESCRIPTION("NFSv3 client support");
MODULE_LICENSE("GPL");
module_init(init_nfs_v3);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7024230f0d1d..c2045a2a9d0f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -67,7 +67,8 @@ struct nfs4_minor_version_ops {
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
int (*test_and_free_expired)(struct nfs_server *,
- nfs4_stateid *, const struct cred *);
+ const nfs4_stateid *,
+ const struct cred *);
struct nfs_seqid *
(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
void (*session_trunk)(struct rpc_clnt *clnt,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 84573df5cf5a..83378f69b35e 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -231,9 +231,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
-
- if (test_bit(NFS_CS_DS, &cl_init->init_flags))
- __set_bit(NFS_CS_DS, &clp->cl_flags);
+ if (test_bit(NFS_CS_PNFS, &cl_init->init_flags))
+ __set_bit(NFS_CS_PNFS, &clp->cl_flags);
/*
* Set up the connection to the server before we add add to the
* global list.
@@ -1013,7 +1012,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- __set_bit(NFS_CS_DS, &cl_init.init_flags);
__set_bit(NFS_CS_PNFS, &cl_init.init_flags);
cl_init.max_connect = NFS_MAX_TRANSPORTS;
/*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a691fa10b3e9..8883016c551c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -103,10 +103,10 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
const struct cred *cred,
struct nfs4_slot *slot,
bool is_privileged);
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
- const struct cred *);
+static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *,
+ const struct cred *);
static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
- const struct cred *, bool);
+ const struct cred *, bool);
#endif
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -293,7 +293,7 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
unsigned long cache_validity;
memcpy(dst, src, NFS4_BITMASK_SZ*sizeof(*dst));
- if (!inode || !nfs4_have_delegation(inode, FMODE_READ))
+ if (!inode || !nfs_have_read_or_write_delegation(inode))
return;
cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags;
@@ -310,6 +310,18 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
dst[1] &= ~FATTR4_WORD1_MODE;
if (!(cache_validity & NFS_INO_INVALID_OTHER))
dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
+
+ if (nfs_have_delegated_mtime(inode)) {
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ if (!(cache_validity & NFS_INO_INVALID_MTIME))
+ dst[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+ if (!(cache_validity & NFS_INO_INVALID_CTIME))
+ dst[1] &= ~FATTR4_WORD1_TIME_METADATA;
+ } else if (nfs_have_delegated_atime(inode)) {
+ if (!(cache_validity & NFS_INO_INVALID_ATIME))
+ dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ }
}
static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -1245,7 +1257,8 @@ nfs4_update_changeattr_locked(struct inode *inode,
struct nfs_inode *nfsi = NFS_I(inode);
u64 change_attr = inode_peek_iversion_raw(inode);
- cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ if (!nfs_have_delegated_mtime(inode))
+ cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
if (S_ISDIR(inode->i_mode))
cache_validity |= NFS_INO_INVALID_DATA;
@@ -1264,7 +1277,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (!nfs_have_delegated_attributes(inode))
cache_validity |=
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
@@ -1320,8 +1333,7 @@ static fmode_t _nfs4_ctx_to_openmode(const struct nfs_open_context *ctx)
}
static u32
-nfs4_map_atomic_open_share(struct nfs_server *server,
- fmode_t fmode, int openflags)
+nfs4_fmode_to_share_access(fmode_t fmode)
{
u32 res = 0;
@@ -1335,11 +1347,27 @@ nfs4_map_atomic_open_share(struct nfs_server *server,
case FMODE_READ|FMODE_WRITE:
res = NFS4_SHARE_ACCESS_BOTH;
}
+ return res;
+}
+
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+ fmode_t fmode, int openflags)
+{
+ u32 res = nfs4_fmode_to_share_access(fmode);
+
if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
goto out;
/* Want no delegation if we're using O_DIRECT */
- if (openflags & O_DIRECT)
+ if (openflags & O_DIRECT) {
res |= NFS4_SHARE_WANT_NO_DELEG;
+ goto out;
+ }
+ /* res |= NFS4_SHARE_WANT_NO_PREFERENCE; */
+ if (server->caps & NFS_CAP_DELEGTIME)
+ res |= NFS4_SHARE_WANT_DELEG_TIMESTAMPS;
+ if (server->caps & NFS_CAP_OPEN_XOR)
+ res |= NFS4_SHARE_WANT_OPEN_XOR_DELEGATION;
out:
return res;
}
@@ -1954,44 +1982,41 @@ out_return_state:
}
static void
-nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
-{
- struct nfs_client *clp = NFS_SERVER(state->inode)->nfs_client;
- struct nfs_delegation *delegation;
- int delegation_flags = 0;
-
- rcu_read_lock();
- delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation)
- delegation_flags = delegation->flags;
- rcu_read_unlock();
- switch (data->o_arg.claim) {
- default:
+nfs4_process_delegation(struct inode *inode, const struct cred *cred,
+ enum open_claim_type4 claim,
+ const struct nfs4_open_delegation *delegation)
+{
+ switch (delegation->open_delegation_type) {
+ case NFS4_OPEN_DELEGATE_READ:
+ case NFS4_OPEN_DELEGATE_WRITE:
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
break;
+ default:
+ return;
+ }
+ switch (claim) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
"returning a delegation for "
"OPEN(CLAIM_DELEGATE_CUR)\n",
- clp->cl_hostname);
- return;
+ NFS_SERVER(inode)->nfs_client->cl_hostname);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ nfs_inode_reclaim_delegation(inode, cred, delegation->type,
+ &delegation->stateid,
+ delegation->pagemod_limit,
+ delegation->open_delegation_type);
+ break;
+ default:
+ nfs_inode_set_delegation(inode, cred, delegation->type,
+ &delegation->stateid,
+ delegation->pagemod_limit,
+ delegation->open_delegation_type);
}
- if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
- nfs_inode_set_delegation(state->inode,
- data->owner->so_cred,
- data->o_res.delegation_type,
- &data->o_res.delegation,
- data->o_res.pagemod_limit);
- else
- nfs_inode_reclaim_delegation(state->inode,
- data->owner->so_cred,
- data->o_res.delegation_type,
- &data->o_res.delegation,
- data->o_res.pagemod_limit);
-
- if (data->o_res.do_recall)
- nfs_async_inode_return_delegation(state->inode,
- &data->o_res.delegation);
+ if (delegation->do_recall)
+ nfs_async_inode_return_delegation(inode, &delegation->stateid);
}
/*
@@ -2015,11 +2040,16 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (ret)
return ERR_PTR(ret);
- if (data->o_res.delegation_type != 0)
- nfs4_opendata_check_deleg(data, state);
+ nfs4_process_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_arg.claim,
+ &data->o_res.delegation);
- if (!update_open_stateid(state, &data->o_res.stateid,
- NULL, data->o_arg.fmode))
+ if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) {
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode))
+ return ERR_PTR(-EAGAIN);
+ } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode))
return ERR_PTR(-EAGAIN);
refcount_inc(&state->count);
@@ -2083,10 +2113,18 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
if (IS_ERR(state))
goto out;
- if (data->o_res.delegation_type != 0)
- nfs4_opendata_check_deleg(data, state);
- if (!update_open_stateid(state, &data->o_res.stateid,
- NULL, data->o_arg.fmode)) {
+ nfs4_process_delegation(state->inode,
+ data->owner->so_cred,
+ data->o_arg.claim,
+ &data->o_res.delegation);
+
+ if (!(data->o_res.rflags & NFS4_OPEN_RESULT_NO_OPEN_STATEID)) {
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode)) {
+ nfs4_put_open_state(state);
+ state = ERR_PTR(-EAGAIN);
+ }
+ } else if (!update_open_stateid(state, NULL, NULL, data->o_arg.fmode)) {
nfs4_put_open_state(state);
state = ERR_PTR(-EAGAIN);
}
@@ -2222,7 +2260,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
{
struct nfs_delegation *delegation;
struct nfs4_opendata *opendata;
- fmode_t delegation_type = 0;
+ u32 delegation_type = NFS4_OPEN_DELEGATE_NONE;
int status;
opendata = nfs4_open_recoverdata_alloc(ctx, state,
@@ -2231,8 +2269,20 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
return PTR_ERR(opendata);
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
- delegation_type = delegation->type;
+ if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0) {
+ switch(delegation->type) {
+ case FMODE_READ:
+ delegation_type = NFS4_OPEN_DELEGATE_READ;
+ if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags))
+ delegation_type = NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG;
+ break;
+ case FMODE_WRITE:
+ case FMODE_READ|FMODE_WRITE:
+ delegation_type = NFS4_OPEN_DELEGATE_WRITE;
+ if (test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags))
+ delegation_type = NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG;
+ }
+ }
rcu_read_unlock();
opendata->o_arg.u.delegation_type = delegation_type;
status = nfs4_open_recover(opendata, state);
@@ -2825,16 +2875,16 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
}
static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
- const struct cred *cred)
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
{
return -NFS4ERR_BAD_STATEID;
}
#if defined(CONFIG_NFS_V4_1)
static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
- const struct cred *cred)
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
{
int status;
@@ -3111,7 +3161,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
if (!opendata->rpc_done)
break;
- if (opendata->o_res.delegation_type != 0)
+ if (opendata->o_res.delegation.type != 0)
dir_verifier = nfs_save_change_attribute(dir);
nfs_set_verifier(dentry, dir_verifier);
}
@@ -3394,7 +3444,8 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
.inode = inode,
.stateid = &arg.stateid,
};
- unsigned long adjust_flags = NFS_INO_INVALID_CHANGE;
+ unsigned long adjust_flags = NFS_INO_INVALID_CHANGE |
+ NFS_INO_INVALID_CTIME;
int err;
if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID))
@@ -3700,7 +3751,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) {
/* Close-to-open cache consistency revalidation */
- if (!nfs4_have_delegation(inode, FMODE_READ)) {
+ if (!nfs4_have_delegation(inode, FMODE_READ, 0)) {
nfs4_bitmask_set(calldata->arg.bitmask_store,
server->cache_consistency_bitmask,
inode, 0);
@@ -3710,8 +3761,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
}
calldata->arg.share_access =
- nfs4_map_atomic_open_share(NFS_SERVER(inode),
- calldata->arg.fmode, 0);
+ nfs4_fmode_to_share_access(calldata->arg.fmode);
if (calldata->res.fattr == NULL)
calldata->arg.bitmask = NULL;
@@ -3852,11 +3902,14 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_XATTR_SUPPORT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_OPEN_ARGUMENTS - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
- u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
+ u32 minorversion = server->nfs_client->cl_minorversion;
+ u32 bitmask[3] = {
+ [0] = FATTR4_WORD0_SUPPORTED_ATTRS,
+ };
struct nfs4_server_caps_arg args = {
.fhandle = fhandle,
.bitmask = bitmask,
@@ -3882,6 +3935,14 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
+ bitmask[0] = (FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT |
+ FATTR4_WORD0_CASE_INSENSITIVE |
+ FATTR4_WORD0_CASE_PRESERVING) &
+ res.attr_bitmask[0];
/* Sanity check the server answers */
switch (minorversion) {
case 0:
@@ -3890,9 +3951,14 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
break;
case 1:
res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT &
+ res.attr_bitmask[2];
break;
case 2:
res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+ bitmask[2] = (FATTR4_WORD2_SUPPATTR_EXCLCREAT |
+ FATTR4_WORD2_OPEN_ARGUMENTS) &
+ res.attr_bitmask[2];
}
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS |
@@ -3915,6 +3981,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
#endif
if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS)
server->caps |= NFS_CAP_FS_LOCATIONS;
+ if (res.attr_bitmask[2] & FATTR4_WORD2_TIME_DELEG_MODIFY)
+ server->caps |= NFS_CAP_DELEGTIME;
if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
@@ -3939,6 +4007,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
sizeof(server->attr_bitmask));
server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+ if (res.open_caps.oa_share_access_want[0] &
+ NFS4_SHARE_WANT_OPEN_XOR_DELEGATION)
+ server->caps |= NFS_CAP_OPEN_XOR;
+
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -4638,7 +4710,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
};
int status = 0;
- if (!nfs4_have_delegation(inode, FMODE_READ)) {
+ if (!nfs4_have_delegation(inode, FMODE_READ, 0)) {
res.fattr = nfs_alloc_fattr();
if (res.fattr == NULL)
return -ENOMEM;
@@ -4956,8 +5028,9 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
goto out;
nfs4_inode_make_writeable(inode);
- nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label), inode,
- NFS_INO_INVALID_CHANGE);
+ nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.fattr->label),
+ inode,
+ NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME);
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (!status) {
nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start,
@@ -5607,7 +5680,7 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
/* Otherwise, request attributes if and only if we don't hold
* a delegation
*/
- return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
+ return nfs4_have_delegation(hdr->inode, FMODE_READ, 0) == 0;
}
void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
@@ -6575,6 +6648,7 @@ struct nfs4_delegreturndata {
u32 roc_barrier;
bool roc;
} lr;
+ struct nfs4_delegattr sattr;
struct nfs_fattr fattr;
int rpc_status;
struct inode *inode;
@@ -6599,6 +6673,30 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
&data->res.lr_ret) == -EAGAIN)
goto out_restart;
+ if (data->args.sattr_args && task->tk_status != 0) {
+ switch(data->res.sattr_ret) {
+ case 0:
+ data->args.sattr_args = NULL;
+ data->res.sattr_res = false;
+ break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ /* Let the main handler below do stateid recovery */
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ if (nfs4_refresh_delegation_stateid(&data->stateid,
+ data->inode))
+ goto out_restart;
+ fallthrough;
+ default:
+ data->args.sattr_args = NULL;
+ data->res.sattr_res = false;
+ goto out_restart;
+ }
+ }
+
switch (task->tk_status) {
case 0:
renew_lease(data->res.server, data->timestamp);
@@ -6692,7 +6790,10 @@ static const struct rpc_call_ops nfs4_delegreturn_ops = {
.rpc_release = nfs4_delegreturn_release,
};
-static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
+ const nfs4_stateid *stateid,
+ struct nfs_delegation *delegation,
+ int issync)
{
struct nfs4_delegreturndata *data;
struct nfs_server *server = NFS_SERVER(inode);
@@ -6744,12 +6845,27 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
}
}
+ if (delegation &&
+ test_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags)) {
+ if (delegation->type & FMODE_READ) {
+ data->sattr.atime = inode_get_atime(inode);
+ data->sattr.atime_set = true;
+ }
+ if (delegation->type & FMODE_WRITE) {
+ data->sattr.mtime = inode_get_mtime(inode);
+ data->sattr.mtime_set = true;
+ }
+ data->args.sattr_args = &data->sattr;
+ data->res.sattr_res = true;
+ }
+
if (!data->inode)
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
1);
else
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
0);
+
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
@@ -6767,13 +6883,16 @@ out:
return status;
}
-int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, const nfs4_stateid *stateid, int issync)
+int nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
+ const nfs4_stateid *stateid,
+ struct nfs_delegation *delegation, int issync)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
int err;
do {
- err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
+ err = _nfs4_proc_delegreturn(inode, cred, stateid,
+ delegation, issync);
trace_nfs4_delegreturn(inode, stateid, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
@@ -7629,10 +7748,10 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease,
int ret;
/* No delegation, no lease */
- if (!nfs4_have_delegation(inode, type))
+ if (!nfs4_have_delegation(inode, type, 0))
return -EAGAIN;
ret = generic_setlease(file, arg, lease, priv);
- if (ret || nfs4_have_delegation(inode, type))
+ if (ret || nfs4_have_delegation(inode, type, 0))
return ret;
/* We raced with a delegation return */
nfs4_delete_lease(file, priv);
@@ -8840,7 +8959,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
#ifdef CONFIG_NFS_V4_1_MIGRATION
calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
#endif
- if (test_bit(NFS_CS_DS, &clp->cl_flags))
+ if (test_bit(NFS_CS_PNFS, &clp->cl_flags))
calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS;
msg.rpc_argp = &calldata->args;
msg.rpc_resp = &calldata->res;
@@ -9854,6 +9973,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
if (!nfs41_sequence_process(task, &lrp->res.seq_res))
return;
+ if (task->tk_rpc_status == -ETIMEDOUT) {
+ lrp->rpc_status = -EAGAIN;
+ lrp->res.lrs_present = 0;
+ return;
+ }
/*
* Was there an RPC level error? Assume the call succeeded,
* and that we need to release the layout
@@ -9876,6 +10000,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
fallthrough;
case 0:
break;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ nfs4_schedule_session_recovery(server->nfs_client->cl_session,
+ task->tk_status);
+ lrp->res.lrs_present = 0;
+ lrp->rpc_status = -EAGAIN;
+ task->tk_status = 0;
+ break;
case -NFS4ERR_DELAY:
if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
@@ -9893,8 +10026,13 @@ static void nfs4_layoutreturn_release(void *calldata)
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
- pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
+ if (lrp->rpc_status == 0 || !lrp->inode)
+ pnfs_layoutreturn_free_lsegs(
+ lo, &lrp->args.stateid, &lrp->args.range,
lrp->res.lrs_present ? &lrp->res.stateid : NULL);
+ else
+ pnfs_layoutreturn_retry_later(lo, &lrp->args.stateid,
+ &lrp->args.range);
nfs4_sequence_free_slot(&lrp->res.seq_res);
if (lrp->ld_private.ops && lrp->ld_private.ops->free)
lrp->ld_private.ops->free(&lrp->ld_private);
@@ -9910,7 +10048,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
.rpc_release = nfs4_layoutreturn_release,
};
-int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, unsigned int flags)
{
struct rpc_task *task;
struct rpc_message msg = {
@@ -9933,7 +10071,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
&task_setup_data.rpc_client, &msg);
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
- if (!sync) {
+ if (flags & PNFS_FL_LAYOUTRETURN_ASYNC) {
if (!lrp->inode) {
nfs4_layoutreturn_release(lrp);
return -EAGAIN;
@@ -9941,6 +10079,8 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
task_setup_data.flags |= RPC_TASK_ASYNC;
}
if (!lrp->inode)
+ flags |= PNFS_FL_LAYOUTRETURN_PRIVILEGED;
+ if (flags & PNFS_FL_LAYOUTRETURN_PRIVILEGED)
nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
1);
else
@@ -9949,7 +10089,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- if (sync)
+ if (!(flags & PNFS_FL_LAYOUTRETURN_ASYNC))
status = task->tk_status;
trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
dprintk("<-- %s status=%d\n", __func__, status);
@@ -10267,12 +10407,12 @@ out:
}
static int _nfs41_test_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
- const struct cred *cred)
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
{
int status;
struct nfs41_test_stateid_args args = {
- .stateid = stateid,
+ .stateid = *stateid,
};
struct nfs41_test_stateid_res res;
struct rpc_message msg = {
@@ -10328,8 +10468,8 @@ static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
* failed or the state ID is not currently valid.
*/
static int nfs41_test_stateid(struct nfs_server *server,
- nfs4_stateid *stateid,
- const struct cred *cred)
+ const nfs4_stateid *stateid,
+ const struct cred *cred)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -10759,6 +10899,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.close_context = nfs4_close_context,
.open_context = nfs4_atomic_open,
.have_delegation = nfs4_have_delegation,
+ .return_delegation = nfs4_inode_return_delegation,
.alloc_client = nfs4_alloc_client,
.init_client = nfs4_init_client,
.free_client = nfs4_free_client,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5b452411e8fd..877f682b45f2 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1863,6 +1863,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
if (!nfs4_state_clear_reclaim_reboot(clp))
return;
+ pnfs_destroy_all_layouts(clp);
ops = clp->cl_mvops->reboot_recovery_ops;
cred = nfs4_get_clid_cred(clp);
err = nfs4_reclaim_complete(clp, ops, cred);
@@ -2068,7 +2069,6 @@ static int nfs4_establish_lease(struct nfs_client *clp)
put_cred(cred);
if (status != 0)
return status;
- pnfs_destroy_all_layouts(clp);
return 0;
}
@@ -2680,6 +2680,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim reboot";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->reboot_recovery_ops);
+ if (status == 0)
+ status = pnfs_layout_handle_reboot(clp);
if (status == -EAGAIN)
continue;
if (status < 0)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 8da5a9c000f4..b29a26923ce0 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -332,6 +332,7 @@ static void __exit exit_nfs_v4(void)
nfs_dns_resolver_destroy();
}
+MODULE_DESCRIPTION("NFSv4 client support");
MODULE_LICENSE("GPL");
module_init(init_nfs_v4);
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d22c6670f770..389941ccc9c9 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -2,6 +2,8 @@
/*
* Copyright (c) 2013 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
+#include <uapi/linux/pr.h>
+#include <linux/blkdev.h>
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
#include "internal.h"
@@ -29,5 +31,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg_err);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg_err);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo);
#endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 4de8780a7c48..22c973316f0b 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2153,6 +2153,94 @@ TRACE_EVENT(ff_layout_commit_error,
)
);
+DECLARE_EVENT_CLASS(pnfs_bl_pr_key_class,
+ TP_PROTO(
+ const struct block_device *bdev,
+ u64 key
+ ),
+ TP_ARGS(bdev, key),
+ TP_STRUCT__entry(
+ __field(u64, key)
+ __field(dev_t, dev)
+ __string(device, bdev->bd_disk->disk_name)
+ ),
+ TP_fast_assign(
+ __entry->key = key;
+ __entry->dev = bdev->bd_dev;
+ __assign_str(device);
+ ),
+ TP_printk("dev=%d,%d (%s) key=0x%016llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(device), __entry->key
+ )
+);
+
+#define DEFINE_NFS4_BLOCK_PRKEY_EVENT(name) \
+ DEFINE_EVENT(pnfs_bl_pr_key_class, name, \
+ TP_PROTO( \
+ const struct block_device *bdev, \
+ u64 key \
+ ), \
+ TP_ARGS(bdev, key))
+DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_reg);
+DEFINE_NFS4_BLOCK_PRKEY_EVENT(bl_pr_key_unreg);
+
+/*
+ * From uapi/linux/pr.h
+ */
+TRACE_DEFINE_ENUM(PR_STS_SUCCESS);
+TRACE_DEFINE_ENUM(PR_STS_IOERR);
+TRACE_DEFINE_ENUM(PR_STS_RESERVATION_CONFLICT);
+TRACE_DEFINE_ENUM(PR_STS_RETRY_PATH_FAILURE);
+TRACE_DEFINE_ENUM(PR_STS_PATH_FAST_FAILED);
+TRACE_DEFINE_ENUM(PR_STS_PATH_FAILED);
+
+#define show_pr_status(x) \
+ __print_symbolic(x, \
+ { PR_STS_SUCCESS, "SUCCESS" }, \
+ { PR_STS_IOERR, "IOERR" }, \
+ { PR_STS_RESERVATION_CONFLICT, "RESERVATION_CONFLICT" }, \
+ { PR_STS_RETRY_PATH_FAILURE, "RETRY_PATH_FAILURE" }, \
+ { PR_STS_PATH_FAST_FAILED, "PATH_FAST_FAILED" }, \
+ { PR_STS_PATH_FAILED, "PATH_FAILED" })
+
+DECLARE_EVENT_CLASS(pnfs_bl_pr_key_err_class,
+ TP_PROTO(
+ const struct block_device *bdev,
+ u64 key,
+ int status
+ ),
+ TP_ARGS(bdev, key, status),
+ TP_STRUCT__entry(
+ __field(u64, key)
+ __field(dev_t, dev)
+ __field(unsigned long, status)
+ __string(device, bdev->bd_disk->disk_name)
+ ),
+ TP_fast_assign(
+ __entry->key = key;
+ __entry->dev = bdev->bd_dev;
+ __entry->status = status;
+ __assign_str(device);
+ ),
+ TP_printk("dev=%d,%d (%s) key=0x%016llx status=%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __get_str(device), __entry->key,
+ show_pr_status(__entry->status)
+ )
+);
+
+#define DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(name) \
+ DEFINE_EVENT(pnfs_bl_pr_key_err_class, name, \
+ TP_PROTO( \
+ const struct block_device *bdev, \
+ u64 key, \
+ int status \
+ ), \
+ TP_ARGS(bdev, key, status))
+DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_reg_err);
+DEFINE_NFS4_BLOCK_PRKEY_ERR_EVENT(bl_pr_key_unreg_err);
+
#ifdef CONFIG_NFS_V4_2
TRACE_DEFINE_ENUM(NFS4_CONTENT_DATA);
TRACE_DEFINE_ENUM(NFS4_CONTENT_HOLE);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1416099dfcd1..7704a4509676 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -224,6 +224,11 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
encode_attrs_maxsz)
#define decode_setattr_maxsz (op_decode_hdr_maxsz + \
nfs4_fattr_bitmap_maxsz)
+#define encode_delegattr_maxsz (op_encode_hdr_maxsz + \
+ encode_stateid_maxsz + \
+ nfs4_fattr_bitmap_maxsz + \
+ 2*nfstime4_maxsz)
+#define decode_delegattr_maxsz (decode_setattr_maxsz)
#define encode_read_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz)
@@ -758,12 +763,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_layoutreturn_maxsz + \
+ encode_delegattr_maxsz + \
encode_delegreturn_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_layoutreturn_maxsz + \
+ decode_delegattr_maxsz + \
decode_delegreturn_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
@@ -1060,9 +1067,10 @@ static void encode_nops(struct compound_hdr *hdr)
*hdr->nops_p = htonl(hdr->nops);
}
-static void encode_nfs4_stateid(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+static void encode_nfs4_stateid(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid)
{
- encode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE);
+ encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
}
static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
@@ -1468,20 +1476,18 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
}
}
-static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
+static inline void encode_delegation_type(struct xdr_stream *xdr, u32 delegation_type)
{
__be32 *p;
p = reserve_space(xdr, 4);
switch (delegation_type) {
- case 0:
- *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
- break;
- case FMODE_READ:
- *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
- break;
- case FMODE_WRITE|FMODE_READ:
- *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
+ case NFS4_OPEN_DELEGATE_NONE:
+ case NFS4_OPEN_DELEGATE_READ:
+ case NFS4_OPEN_DELEGATE_WRITE:
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ *p = cpu_to_be32(delegation_type);
break;
default:
BUG();
@@ -1497,7 +1503,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
encode_string(xdr, name->len, name->name);
}
-static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
+static inline void encode_claim_previous(struct xdr_stream *xdr, u32 type)
{
__be32 *p;
@@ -1735,6 +1741,33 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
server->attr_bitmask);
}
+static void encode_delegattr(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs4_delegattr *attr,
+ struct compound_hdr *hdr)
+{
+ uint32_t bitmap[3] = { 0 };
+ uint32_t len = 0;
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_SETATTR, encode_delegattr_maxsz, hdr);
+ encode_nfs4_stateid(xdr, stateid);
+ if (attr->atime_set) {
+ bitmap[2] |= FATTR4_WORD2_TIME_DELEG_ACCESS;
+ len += (nfstime4_maxsz << 2);
+ }
+ if (attr->mtime_set) {
+ bitmap[2] |= FATTR4_WORD2_TIME_DELEG_MODIFY;
+ len += (nfstime4_maxsz << 2);
+ }
+ xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap));
+ xdr_stream_encode_opaque_inline(xdr, (void **)&p, len);
+ if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS)
+ p = xdr_encode_nfstime4(p, &attr->atime);
+ if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_MODIFY)
+ p = xdr_encode_nfstime4(p, &attr->mtime);
+}
+
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
{
__be32 *p;
@@ -2105,7 +2138,7 @@ static void encode_test_stateid(struct xdr_stream *xdr,
{
encode_op_hdr(xdr, OP_TEST_STATEID, decode_test_stateid_maxsz, hdr);
encode_uint32(xdr, 1);
- encode_nfs4_stateid(xdr, args->stateid);
+ encode_nfs4_stateid(xdr, &args->stateid);
}
static void encode_free_stateid(struct xdr_stream *xdr,
@@ -2812,6 +2845,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
encode_putfh(xdr, args->fhandle, &hdr);
if (args->lr_args)
encode_layoutreturn(xdr, args->lr_args, &hdr);
+ if (args->sattr_args)
+ encode_delegattr(xdr, args->stateid, args->sattr_args, &hdr);
if (args->bitmask)
encode_getfattr(xdr, args->bitmask, &hdr);
encode_delegreturn(xdr, args->stateid, &hdr);
@@ -4303,6 +4338,28 @@ static int decode_attr_xattrsupport(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}
+static int decode_attr_open_arguments(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs4_open_caps *res)
+{
+ memset(res, 0, sizeof(*res));
+ if (unlikely(bitmap[2] & (FATTR4_WORD2_OPEN_ARGUMENTS - 1U)))
+ return -EIO;
+ if (likely(bitmap[2] & FATTR4_WORD2_OPEN_ARGUMENTS)) {
+ if (decode_bitmap4(xdr, res->oa_share_access, ARRAY_SIZE(res->oa_share_access)) < 0)
+ return -EIO;
+ if (decode_bitmap4(xdr, res->oa_share_deny, ARRAY_SIZE(res->oa_share_deny)) < 0)
+ return -EIO;
+ if (decode_bitmap4(xdr, res->oa_share_access_want, ARRAY_SIZE(res->oa_share_access_want)) < 0)
+ return -EIO;
+ if (decode_bitmap4(xdr, res->oa_open_claim, ARRAY_SIZE(res->oa_open_claim)) < 0)
+ return -EIO;
+ if (decode_bitmap4(xdr, res->oa_createmode, ARRAY_SIZE(res->oa_createmode)) < 0)
+ return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_OPEN_ARGUMENTS;
+ }
+ return 0;
+}
+
static int verify_attr_len(struct xdr_stream *xdr, unsigned int savep, uint32_t attrlen)
{
unsigned int attrwords = XDR_QUADLEN(attrlen);
@@ -4477,6 +4534,8 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
res->exclcreat_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_open_arguments(xdr, bitmap, &res->open_caps)) != 0)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -5148,13 +5207,12 @@ static int decode_space_limit(struct xdr_stream *xdr,
}
static int decode_rw_delegation(struct xdr_stream *xdr,
- uint32_t delegation_type,
- struct nfs_openres *res)
+ struct nfs4_open_delegation *res)
{
__be32 *p;
int status;
- status = decode_delegation_stateid(xdr, &res->delegation);
+ status = decode_delegation_stateid(xdr, &res->stateid);
if (unlikely(status))
return status;
p = xdr_inline_decode(xdr, 4);
@@ -5162,52 +5220,57 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
return -EIO;
res->do_recall = be32_to_cpup(p);
- switch (delegation_type) {
+ switch (res->open_delegation_type) {
case NFS4_OPEN_DELEGATE_READ:
- res->delegation_type = FMODE_READ;
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ res->type = FMODE_READ;
break;
case NFS4_OPEN_DELEGATE_WRITE:
- res->delegation_type = FMODE_WRITE|FMODE_READ;
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ res->type = FMODE_WRITE|FMODE_READ;
if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
return -EIO;
}
return decode_ace(xdr, NULL);
}
-static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+static int decode_no_delegation(struct xdr_stream *xdr,
+ struct nfs4_open_delegation *res)
{
__be32 *p;
- uint32_t why_no_delegation;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
return -EIO;
- why_no_delegation = be32_to_cpup(p);
- switch (why_no_delegation) {
+ res->why_no_delegation = be32_to_cpup(p);
+ switch (res->why_no_delegation) {
case WND4_CONTENTION:
case WND4_RESOURCE:
- xdr_inline_decode(xdr, 4);
- /* Ignore for now */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -EIO;
+ res->will_notify = be32_to_cpup(p);
}
return 0;
}
-static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+static int decode_delegation(struct xdr_stream *xdr,
+ struct nfs4_open_delegation *res)
{
__be32 *p;
- uint32_t delegation_type;
p = xdr_inline_decode(xdr, 4);
if (unlikely(!p))
return -EIO;
- delegation_type = be32_to_cpup(p);
- res->delegation_type = 0;
- switch (delegation_type) {
+ res->open_delegation_type = be32_to_cpup(p);
+ switch (res->open_delegation_type) {
case NFS4_OPEN_DELEGATE_NONE:
return 0;
case NFS4_OPEN_DELEGATE_READ:
case NFS4_OPEN_DELEGATE_WRITE:
- return decode_rw_delegation(xdr, delegation_type, res);
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ return decode_rw_delegation(xdr, res);
case NFS4_OPEN_DELEGATE_NONE_EXT:
return decode_no_delegation(xdr, res);
}
@@ -5248,7 +5311,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
for (; i < NFS4_BITMAP_SIZE; i++)
res->attrset[i] = 0;
- return decode_delegation(xdr, res);
+ return decode_delegation(xdr, &res->delegation);
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
@@ -5480,6 +5543,11 @@ static int decode_setattr(struct xdr_stream *xdr)
return -EIO;
}
+static int decode_delegattr(struct xdr_stream *xdr)
+{
+ return decode_setattr(xdr);
+}
+
static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_res *res)
{
__be32 *p;
@@ -7052,6 +7120,12 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
if (status)
goto out;
}
+ if (res->sattr_res) {
+ status = decode_delegattr(xdr);
+ res->sattr_ret = status;
+ if (status)
+ goto out;
+ }
if (res->fattr) {
status = decode_getfattr(xdr, res->fattr, res->server);
if (status != 0)
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 1e710654af11..352fdaed4075 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -939,10 +939,11 @@ TRACE_EVENT(nfs_sillyrename_unlink,
DECLARE_EVENT_CLASS(nfs_folio_event,
TP_PROTO(
const struct inode *inode,
- struct folio *folio
+ loff_t offset,
+ size_t count
),
- TP_ARGS(inode, folio),
+ TP_ARGS(inode, offset, count),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -950,7 +951,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
__field(u64, fileid)
__field(u64, version)
__field(loff_t, offset)
- __field(u32, count)
+ __field(size_t, count)
),
TP_fast_assign(
@@ -960,13 +961,13 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = folio_file_pos(folio);
- __entry->count = nfs_folio_length(folio);
+ __entry->offset = offset,
+ __entry->count = count;
),
TP_printk(
"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
- "offset=%lld count=%u",
+ "offset=%lld count=%zu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->version,
@@ -978,18 +979,20 @@ DECLARE_EVENT_CLASS(nfs_folio_event,
DEFINE_EVENT(nfs_folio_event, name, \
TP_PROTO( \
const struct inode *inode, \
- struct folio *folio \
+ loff_t offset, \
+ size_t count \
), \
- TP_ARGS(inode, folio))
+ TP_ARGS(inode, offset, count))
DECLARE_EVENT_CLASS(nfs_folio_event_done,
TP_PROTO(
const struct inode *inode,
- struct folio *folio,
+ loff_t offset,
+ size_t count,
int ret
),
- TP_ARGS(inode, folio, ret),
+ TP_ARGS(inode, offset, count, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -998,7 +1001,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
__field(u64, fileid)
__field(u64, version)
__field(loff_t, offset)
- __field(u32, count)
+ __field(size_t, count)
),
TP_fast_assign(
@@ -1008,14 +1011,14 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = folio_file_pos(folio);
- __entry->count = nfs_folio_length(folio);
+ __entry->offset = offset,
+ __entry->count = count,
__entry->ret = ret;
),
TP_printk(
"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
- "offset=%lld count=%u ret=%d",
+ "offset=%lld count=%zu ret=%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->version,
@@ -1027,10 +1030,11 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done,
DEFINE_EVENT(nfs_folio_event_done, name, \
TP_PROTO( \
const struct inode *inode, \
- struct folio *folio, \
+ loff_t offset, \
+ size_t count, \
int ret \
), \
- TP_ARGS(inode, folio, ret))
+ TP_ARGS(inode, offset, count, ret))
DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage);
DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 040b6b79c75e..04124f226665 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -188,102 +188,6 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/*
- * nfs_page_lock_head_request - page lock the head of the page group
- * @req: any member of the page group
- */
-struct nfs_page *
-nfs_page_group_lock_head(struct nfs_page *req)
-{
- struct nfs_page *head = req->wb_head;
-
- while (!nfs_lock_request(head)) {
- int ret = nfs_wait_on_request(head);
- if (ret < 0)
- return ERR_PTR(ret);
- }
- if (head != req)
- kref_get(&head->wb_kref);
- return head;
-}
-
-/*
- * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
- * @head: head request of page group, must be holding head lock
- * @req: request that couldn't lock and needs to wait on the req bit lock
- *
- * This is a helper function for nfs_lock_and_join_requests
- * returns 0 on success, < 0 on error.
- */
-static void
-nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
-{
- struct nfs_page *tmp;
-
- /* relinquish all the locks successfully grabbed this run */
- for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
- if (!kref_read(&tmp->wb_kref))
- continue;
- nfs_unlock_and_release_request(tmp);
- }
-}
-
-/*
- * nfs_page_group_lock_subreq - try to lock a subrequest
- * @head: head request of page group
- * @subreq: request to lock
- *
- * This is a helper function for nfs_lock_and_join_requests which
- * must be called with the head request and page group both locked.
- * On error, it returns with the page group unlocked.
- */
-static int
-nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
-{
- int ret;
-
- if (!kref_get_unless_zero(&subreq->wb_kref))
- return 0;
- while (!nfs_lock_request(subreq)) {
- nfs_page_group_unlock(head);
- ret = nfs_wait_on_request(subreq);
- if (!ret)
- ret = nfs_page_group_lock(head);
- if (ret < 0) {
- nfs_unroll_locks(head, subreq);
- nfs_release_request(subreq);
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * nfs_page_group_lock_subrequests - try to lock the subrequests
- * @head: head request of page group
- *
- * This is a helper function for nfs_lock_and_join_requests which
- * must be called with the head request locked.
- */
-int nfs_page_group_lock_subrequests(struct nfs_page *head)
-{
- struct nfs_page *subreq;
- int ret;
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- return ret;
- /* lock each request in the page group */
- for (subreq = head->wb_this_page; subreq != head;
- subreq = subreq->wb_this_page) {
- ret = nfs_page_group_lock_subreq(head, subreq);
- if (ret < 0)
- return ret;
- }
- nfs_page_group_unlock(head);
- return 0;
-}
-
-/*
* nfs_page_set_headlock - set the request PG_HEADLOCK
* @req: request that is to be locked
*
@@ -569,7 +473,7 @@ struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx,
if (IS_ERR(l_ctx))
return ERR_CAST(l_ctx);
- ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count);
+ ret = nfs_page_create(l_ctx, offset, folio->index, offset, count);
if (!IS_ERR(ret)) {
nfs_page_assign_folio(ret, folio);
nfs_page_group_init(ret, NULL);
@@ -694,25 +598,6 @@ void nfs_release_request(struct nfs_page *req)
}
EXPORT_SYMBOL_GPL(nfs_release_request);
-/**
- * nfs_wait_on_request - Wait for a request to complete.
- * @req: request to wait upon.
- *
- * Interruptible by fatal signals only.
- * The user is responsible for holding a count on the request.
- */
-int
-nfs_wait_on_request(struct nfs_page *req)
-{
- if (!test_bit(PG_BUSY, &req->wb_flags))
- return 0;
- set_bit(PG_CONTENDED2, &req->wb_flags);
- smp_mb__after_atomic();
- return wait_on_bit_io(&req->wb_flags, PG_BUSY,
- TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL_GPL(nfs_wait_on_request);
-
/*
* nfs_generic_pg_test - determine if requests can be coalesced
* @desc: pointer to descriptor
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index b5834728f31b..aa698481bec8 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -61,6 +61,7 @@ static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
u32 seq);
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list);
+static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -476,6 +477,18 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
return !list_empty(&lo->plh_segs);
}
+static int pnfs_mark_layout_stateid_return(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list,
+ enum pnfs_iomode iomode, u32 seq)
+{
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ return pnfs_mark_matching_lsegs_return(lo, lseg_list, &range, seq);
+}
+
static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
@@ -846,8 +859,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
break;
inode = pnfs_grab_inode_layout_hdr(lo);
if (inode != NULL) {
- if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
- list_del_rcu(&lo->plh_layouts);
if (pnfs_layout_add_bulk_destroy_list(inode,
layout_list))
continue;
@@ -868,7 +879,7 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
- bool is_bulk_recall)
+ enum pnfs_layout_destroy_mode mode)
{
struct pnfs_layout_hdr *lo;
struct inode *inode;
@@ -886,8 +897,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
- if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
- if (is_bulk_recall)
+ if (mode == PNFS_LAYOUT_FILE_BULK_RETURN) {
+ pnfs_mark_layout_stateid_return(lo, &lseg_list,
+ IOMODE_ANY, 0);
+ } else if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (mode == PNFS_LAYOUT_BULK_RETURN)
set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
ret = -EAGAIN;
}
@@ -901,10 +915,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
return ret;
}
-int
-pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
- struct nfs_fsid *fsid,
- bool is_recall)
+int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid,
+ enum pnfs_layout_destroy_mode mode)
{
struct nfs_server *server;
LIST_HEAD(layout_list);
@@ -923,33 +935,40 @@ restart:
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- if (list_empty(&layout_list))
- return 0;
- return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+ return pnfs_layout_free_bulk_destroy_list(&layout_list, mode);
}
-int
-pnfs_destroy_layouts_byclid(struct nfs_client *clp,
- bool is_recall)
+static void pnfs_layout_build_destroy_list_byclient(struct nfs_client *clp,
+ struct list_head *list)
{
struct nfs_server *server;
- LIST_HEAD(layout_list);
spin_lock(&clp->cl_lock);
rcu_read_lock();
restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- if (pnfs_layout_bulk_destroy_byserver_locked(clp,
- server,
- &layout_list) != 0)
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp, server,
+ list) != 0)
goto restart;
}
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
+}
- if (list_empty(&layout_list))
- return 0;
- return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+static int pnfs_layout_do_destroy_byclid(struct nfs_client *clp,
+ struct list_head *list,
+ enum pnfs_layout_destroy_mode mode)
+{
+ pnfs_layout_build_destroy_list_byclient(clp, list);
+ return pnfs_layout_free_bulk_destroy_list(list, mode);
+}
+
+int pnfs_layout_destroy_byclid(struct nfs_client *clp,
+ enum pnfs_layout_destroy_mode mode)
+{
+ LIST_HEAD(layout_list);
+
+ return pnfs_layout_do_destroy_byclid(clp, &layout_list, mode);
}
/*
@@ -962,7 +981,68 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
nfs4_deviceid_mark_client_invalid(clp);
nfs4_deviceid_purge_client(clp);
- pnfs_destroy_layouts_byclid(clp, false);
+ pnfs_layout_destroy_byclid(clp, PNFS_LAYOUT_INVALIDATE);
+}
+
+static void pnfs_layout_build_recover_list_byclient(struct nfs_client *clp,
+ struct list_head *list)
+{
+ struct nfs_server *server;
+
+ spin_lock(&clp->cl_lock);
+ rcu_read_lock();
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (!(server->caps & NFS_CAP_REBOOT_LAYOUTRETURN))
+ continue;
+ if (pnfs_layout_bulk_destroy_byserver_locked(clp, server,
+ list) != 0)
+ goto restart;
+ }
+ rcu_read_unlock();
+ spin_unlock(&clp->cl_lock);
+}
+
+static int pnfs_layout_bulk_list_reboot(struct list_head *list)
+{
+ struct pnfs_layout_hdr *lo;
+ struct nfs_server *server;
+ int ret;
+
+ list_for_each_entry(lo, list, plh_bulk_destroy) {
+ server = NFS_SERVER(lo->plh_inode);
+ ret = pnfs_layout_return_on_reboot(lo);
+ switch (ret) {
+ case 0:
+ continue;
+ case -NFS4ERR_BAD_STATEID:
+ server->caps &= ~NFS_CAP_REBOOT_LAYOUTRETURN;
+ break;
+ case -NFS4ERR_NO_GRACE:
+ break;
+ default:
+ goto err;
+ }
+ break;
+ }
+ return 0;
+err:
+ return ret;
+}
+
+int pnfs_layout_handle_reboot(struct nfs_client *clp)
+{
+ LIST_HEAD(list);
+ int ret = 0, ret2;
+
+ pnfs_layout_build_recover_list_byclient(clp, &list);
+ if (!list_empty(&list))
+ ret = pnfs_layout_bulk_list_reboot(&list);
+ ret2 = pnfs_layout_do_destroy_byclid(clp, &list,
+ PNFS_LAYOUT_INVALIDATE);
+ if (!ret)
+ ret = ret2;
+ return (ret == 0) ? 0 : -EAGAIN;
}
static void
@@ -1163,6 +1243,38 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
}
}
+static void
+pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range)
+{
+ const struct pnfs_layout_segment *lseg;
+ u32 seq = be32_to_cpu(arg_stateid->seqid);
+
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) {
+ list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) {
+ if (pnfs_seqid_is_newer(lseg->pls_seq, seq) ||
+ !pnfs_should_free_range(&lseg->pls_range, range))
+ continue;
+ pnfs_set_plh_return_info(lo, range->iomode, seq);
+ break;
+ }
+ }
+}
+
+void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range)
+{
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range);
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+}
+
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
const nfs4_stateid *arg_stateid,
const struct pnfs_layout_range *range,
@@ -1239,7 +1351,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
const nfs4_stateid *stateid,
const struct cred **pcred,
enum pnfs_iomode iomode,
- bool sync)
+ unsigned int flags)
{
struct inode *ino = lo->plh_inode;
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
@@ -1266,33 +1378,21 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
if (ld->prepare_layoutreturn)
ld->prepare_layoutreturn(&lrp->args);
- status = nfs4_proc_layoutreturn(lrp, sync);
+ status = nfs4_proc_layoutreturn(lrp, flags);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
}
-static bool
-pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
- enum pnfs_iomode iomode,
- u32 seq)
-{
- struct pnfs_layout_range recall_range = {
- .length = NFS4_MAX_UINT64,
- .iomode = iomode,
- };
- return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
- &recall_range, seq) != -EBUSY;
-}
-
/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
return false;
- return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
- lo->plh_return_seq);
+ return pnfs_mark_layout_stateid_return(lo, &lo->plh_return_segs,
+ lo->plh_return_iomode,
+ lo->plh_return_seq) != EBUSY;
}
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
@@ -1312,7 +1412,8 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode,
+ PNFS_FL_LAYOUTRETURN_ASYNC);
}
} else
spin_unlock(&inode->i_lock);
@@ -1379,7 +1480,8 @@ _pnfs_return_layout(struct inode *ino)
send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
spin_unlock(&ino->i_lock);
if (send)
- status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY,
+ 0);
out_wait_layoutreturn:
wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
out_put_layout_hdr:
@@ -1417,6 +1519,24 @@ pnfs_commit_and_return_layout(struct inode *inode)
return ret;
}
+static int pnfs_layout_return_on_reboot(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode = lo->plh_inode;
+ const struct cred *cred;
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ return 0;
+ }
+ cred = get_cred(lo->plh_lc_cred);
+ pnfs_get_layout_hdr(lo);
+ spin_unlock(&inode->i_lock);
+
+ return pnfs_send_layoutreturn(lo, &zero_stateid, &cred, IOMODE_ANY,
+ PNFS_FL_LAYOUTRETURN_PRIVILEGED);
+}
+
bool pnfs_roc(struct inode *ino,
struct nfs4_layoutreturn_args *args,
struct nfs4_layoutreturn_res *res,
@@ -1520,7 +1640,7 @@ out_noroc:
return true;
}
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, 0);
pnfs_put_layout_hdr(lo);
return false;
}
@@ -1570,8 +1690,7 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
}
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
- struct nfs4_layoutreturn_res *res,
- int ret)
+ struct nfs4_layoutreturn_res *res, int ret)
{
struct pnfs_layout_hdr *lo = args->layout;
struct inode *inode = args->inode;
@@ -1579,11 +1698,13 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
switch (ret) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
case -NFS4ERR_NOMATCHING_LAYOUT:
spin_lock(&inode->i_lock);
- if (pnfs_layout_is_valid(lo) &&
- nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
- pnfs_set_plh_return_info(lo, args->range.iomode, 0);
+ pnfs_layoutreturn_retry_later_locked(lo, &args->stateid,
+ &args->range);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&inode->i_lock);
break;
@@ -2566,7 +2687,8 @@ pnfs_mark_layout_for_return(struct inode *inode,
return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
- pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode,
+ PNFS_FL_LAYOUTRETURN_ASYNC);
} else {
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
@@ -2682,7 +2804,8 @@ restart:
}
spin_unlock(&inode->i_lock);
rcu_read_unlock();
- pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode,
+ PNFS_FL_LAYOUTRETURN_ASYNC);
pnfs_put_layout_hdr(lo);
cond_resched();
goto restart;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index fa5beeaaf5da..30d2613e912b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -118,6 +118,12 @@ enum layoutdriver_policy_flags {
PNFS_LAYOUTGET_ON_OPEN = 1 << 3,
};
+enum pnfs_layout_destroy_mode {
+ PNFS_LAYOUT_INVALIDATE = 0,
+ PNFS_LAYOUT_BULK_RETURN,
+ PNFS_LAYOUT_FILE_BULK_RETURN,
+};
+
struct nfs4_deviceid_node;
/* Per-layout driver specific registration structure */
@@ -127,7 +133,6 @@ struct pnfs_layoutdriver_type {
const char *name;
struct module *owner;
unsigned flags;
- unsigned max_deviceinfo_size;
unsigned max_layoutget_response;
int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
@@ -193,8 +198,6 @@ struct pnfs_commit_ops {
int max);
void (*recover_commit_reqs) (struct list_head *list,
struct nfs_commit_info *cinfo);
- struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
- struct folio *folio);
};
struct pnfs_layout_hdr {
@@ -242,6 +245,9 @@ extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
/* nfs4proc.c */
+#define PNFS_FL_LAYOUTRETURN_ASYNC (1U << 0)
+#define PNFS_FL_LAYOUTRETURN_PRIVILEGED (1U << 1)
+
extern size_t max_response_pages(struct nfs_server *server);
extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
@@ -249,7 +255,8 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
extern struct pnfs_layout_segment *
nfs4_proc_layoutget(struct nfs4_layoutget *lgp,
struct nfs4_exception *exception);
-extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp,
+ unsigned int flags);
/* pnfs.c */
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -273,11 +280,10 @@ void pnfs_free_lseg_list(struct list_head *tmp_list);
void pnfs_destroy_layout(struct nfs_inode *);
void pnfs_destroy_layout_final(struct nfs_inode *);
void pnfs_destroy_all_layouts(struct nfs_client *);
-int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
- struct nfs_fsid *fsid,
- bool is_recall);
-int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
- bool is_recall);
+int pnfs_layout_destroy_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid,
+ enum pnfs_layout_destroy_mode mode);
+int pnfs_layout_destroy_byclid(struct nfs_client *clp,
+ enum pnfs_layout_destroy_mode mode);
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
struct pnfs_layout_range *dst_range,
struct inode *inode);
@@ -323,6 +329,9 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
enum pnfs_iomode iomode,
bool strict_iomode,
gfp_t gfp_flags);
+void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range);
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
const nfs4_stateid *arg_stateid,
const struct pnfs_layout_range *range,
@@ -344,6 +353,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
enum pnfs_iomode iomode);
+int pnfs_layout_handle_reboot(struct nfs_client *clp);
/* nfs4_deviceid_flags */
enum {
@@ -396,8 +406,6 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
void pnfs_generic_rw_release(void *data);
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo);
-struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
- struct folio *folio);
int pnfs_generic_commit_pagelist(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -557,17 +565,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
fl_cinfo->ops->recover_commit_reqs(head, cinfo);
}
-static inline struct nfs_page *
-pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
- struct folio *folio)
-{
- struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
-
- if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
- return NULL;
- return fl_cinfo->ops->search_commit_reqs(cinfo, folio);
-}
-
/* Should the pNFS client commit and return the layout upon a setattr */
static inline bool
pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -725,6 +722,11 @@ static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
{
}
+static inline int pnfs_layout_handle_reboot(struct nfs_client *clp)
+{
+ return 0;
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -864,13 +866,6 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
{
}
-static inline struct nfs_page *
-pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
- struct folio *folio)
-{
- return NULL;
-}
-
static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
return 0;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 178001c90156..bf0f2d67e96c 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -110,9 +110,6 @@ nfs4_get_device_info(struct nfs_server *server,
* GETDEVICEINFO's maxcount
*/
max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
- if (server->pnfs_curr_ld->max_deviceinfo_size &&
- server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
- max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
max_pages = nfs_page_array_len(0, max_resp_sz);
dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
__func__, server, max_resp_sz, max_pages);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 88e061bd711b..a74ee69a2fa6 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -351,53 +351,6 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
}
EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
-static struct nfs_page *
-pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
- unsigned int nbuckets, struct folio *folio)
-{
- struct nfs_page *req;
- struct pnfs_commit_bucket *b;
- unsigned int i;
-
- /* Linearly search the commit lists for each bucket until a matching
- * request is found */
- for (i = 0, b = buckets; i < nbuckets; i++, b++) {
- list_for_each_entry(req, &b->written, wb_list) {
- if (nfs_page_to_folio(req) == folio)
- return req->wb_head;
- }
- list_for_each_entry(req, &b->committing, wb_list) {
- if (nfs_page_to_folio(req) == folio)
- return req->wb_head;
- }
- }
- return NULL;
-}
-
-/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
- * for @folio
- * @cinfo - commit info for current inode
- * @folio - page to search for matching head request
- *
- * Return: the head request if one is found, otherwise %NULL.
- */
-struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
- struct folio *folio)
-{
- struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- struct pnfs_commit_array *array;
- struct nfs_page *req;
-
- list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
- req = pnfs_bucket_search_commit_reqs(array->buckets,
- array->nbuckets, folio);
- if (req)
- return req;
- }
- return NULL;
-}
-EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
-
static struct pnfs_layout_segment *
pnfs_bucket_get_committing(struct list_head *head,
struct pnfs_commit_bucket *bucket,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index d105e5b2659d..6c09cd090c34 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -687,11 +687,18 @@ out_einval:
return -EINVAL;
}
-static int nfs_have_delegation(struct inode *inode, fmode_t flags)
+static int nfs_have_delegation(struct inode *inode, fmode_t type, int flags)
{
return 0;
}
+static int nfs_return_delegation(struct inode *inode)
+{
+ if (S_ISREG(inode->i_mode))
+ nfs_wb_all(inode);
+ return 0;
+}
+
static const struct inode_operations nfs_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -757,6 +764,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.lock_check_bounds = nfs_lock_check_bounds,
.close_context = nfs_close_context,
.have_delegation = nfs_have_delegation,
+ .return_delegation = nfs_return_delegation,
.alloc_client = nfs_alloc_client,
.init_client = nfs_init_client,
.free_client = nfs_free_client,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a142287d86f6..a6103333b666 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -28,6 +28,7 @@
#include "fscache.h"
#include "pnfs.h"
#include "nfstrace.h"
+#include "delegation.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -122,8 +123,6 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
{
struct folio *folio = nfs_page_to_folio(req);
- if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
- folio_set_error(folio);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE))
if (nfs_netfs_folio_unlock(folio))
folio_unlock(folio);
@@ -288,7 +287,7 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
struct nfs_open_context *ctx,
struct folio *folio)
{
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
struct nfs_server *server = NFS_SERVER(inode);
size_t fsize = folio_size(folio);
unsigned int rsize = server->rsize;
@@ -324,21 +323,57 @@ out:
}
/*
- * Read a page over NFS.
- * We read the page synchronously in the following case:
- * - The error flag is set for this page. This happens only when a
- * previous async read operation failed.
+ * Actually read a folio over the wire.
*/
-int nfs_read_folio(struct file *file, struct folio *folio)
+static int nfs_do_read_folio(struct file *file, struct folio *folio)
{
struct inode *inode = file_inode(file);
struct nfs_pageio_descriptor pgio;
struct nfs_open_context *ctx;
int ret;
- trace_nfs_aop_readpage(inode, folio);
+ ctx = get_nfs_open_context(nfs_file_open_context(file));
+
+ xchg(&ctx->error, 0);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
+
+ ret = nfs_read_add_folio(&pgio, ctx, folio);
+ if (ret)
+ goto out_put;
+
+ nfs_pageio_complete_read(&pgio);
+ nfs_update_delegated_atime(inode);
+ if (pgio.pg_error < 0) {
+ ret = pgio.pg_error;
+ goto out_put;
+ }
+
+ ret = folio_wait_locked_killable(folio);
+ if (!folio_test_uptodate(folio) && !ret)
+ ret = xchg(&ctx->error, 0);
+
+out_put:
+ put_nfs_open_context(ctx);
+ return ret;
+}
+
+/*
+ * Synchronously read a folio.
+ *
+ * This is not heavily used as most users to try an asynchronous
+ * large read through ->readahead first.
+ */
+int nfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct inode *inode = file_inode(file);
+ loff_t pos = folio_pos(folio);
+ size_t len = folio_size(folio);
+ int ret;
+
+ trace_nfs_aop_readpage(inode, pos, len);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
- task_io_account_read(folio_size(folio));
+ task_io_account_read(len);
/*
* Try to flush any pending writes to the file..
@@ -358,30 +393,10 @@ int nfs_read_folio(struct file *file, struct folio *folio)
goto out_unlock;
ret = nfs_netfs_read_folio(file, folio);
- if (!ret)
- goto out;
-
- ctx = get_nfs_open_context(nfs_file_open_context(file));
-
- xchg(&ctx->error, 0);
- nfs_pageio_init_read(&pgio, inode, false,
- &nfs_async_read_completion_ops);
-
- ret = nfs_read_add_folio(&pgio, ctx, folio);
if (ret)
- goto out_put;
-
- nfs_pageio_complete_read(&pgio);
- ret = pgio.pg_error < 0 ? pgio.pg_error : 0;
- if (!ret) {
- ret = folio_wait_locked_killable(folio);
- if (!folio_test_uptodate(folio) && !ret)
- ret = xchg(&ctx->error, 0);
- }
-out_put:
- put_nfs_open_context(ctx);
+ ret = nfs_do_read_folio(file, folio);
out:
- trace_nfs_aop_readpage_done(inode, folio, ret);
+ trace_nfs_aop_readpage_done(inode, pos, len, ret);
return ret;
out_unlock:
folio_unlock(folio);
@@ -428,6 +443,7 @@ void nfs_readahead(struct readahead_control *ractl)
}
nfs_pageio_complete_read(&pgio);
+ nfs_update_delegated_atime(inode);
put_nfs_open_context(ctx);
out:
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 13818129d268..1c62a5a9f51d 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -32,15 +32,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio)
int error;
error = NFS_PROTO(inode)->readlink(inode, &folio->page, 0, PAGE_SIZE);
- if (error < 0)
- goto error;
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
-
-error:
- folio_set_error(folio);
- folio_unlock(folio);
+ folio_end_read(folio, error == 0);
return error;
}
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 0110299643a2..bf77399696a7 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -232,6 +232,8 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);
+ NFS_PROTO(inode)->return_delegation(inode);
+
if (NFS_STALE(inode) || !nfs_call_unlink(dentry, inode, data))
nfs_free_unlinkdata(data);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2329cbb0e446..d074d0ceb4f0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -63,9 +63,6 @@ static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
struct nfs_page *req);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
- struct folio *folio);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -147,53 +144,23 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
kref_put(&ioc->refcount, nfs_io_completion_release);
}
-static void
-nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
-{
- if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
- kref_get(&req->wb_kref);
- atomic_long_inc(&NFS_I(inode)->nrequests);
- }
-}
-
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
-{
- int ret;
-
- if (!test_bit(PG_REMOVE, &req->wb_flags))
- return 0;
- ret = nfs_page_group_lock(req);
- if (ret)
- return ret;
- if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
- nfs_page_set_inode_ref(req, inode);
- nfs_page_group_unlock(req);
- return 0;
-}
-
-static struct nfs_page *nfs_folio_private_request(struct folio *folio)
-{
- return folio_get_private(folio);
-}
-
/**
- * nfs_folio_find_private_request - find head request associated with a folio
+ * nfs_folio_find_head_request - find head request associated with a folio
* @folio: pointer to folio
*
* must be called while holding the inode lock.
*
* returns matching head request with reference held, or NULL if not found.
*/
-static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
+static struct nfs_page *nfs_folio_find_head_request(struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct nfs_page *req;
if (!folio_test_private(folio))
return NULL;
spin_lock(&mapping->i_private_lock);
- req = nfs_folio_private_request(folio);
+ req = folio->private;
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
@@ -202,86 +169,20 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
return req;
}
-static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio)
-{
- struct inode *inode = folio_file_mapping(folio)->host;
- struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_page *req = NULL;
- if (!folio_test_swapcache(folio))
- return NULL;
- mutex_lock(&nfsi->commit_mutex);
- if (folio_test_swapcache(folio)) {
- req = nfs_page_search_commits_for_head_request_locked(nfsi,
- folio);
- if (req) {
- WARN_ON_ONCE(req->wb_head != req);
- kref_get(&req->wb_kref);
- }
- }
- mutex_unlock(&nfsi->commit_mutex);
- return req;
-}
-
-/**
- * nfs_folio_find_head_request - find head request associated with a folio
- * @folio: pointer to folio
- *
- * returns matching head request with reference held, or NULL if not found.
- */
-static struct nfs_page *nfs_folio_find_head_request(struct folio *folio)
-{
- struct nfs_page *req;
-
- req = nfs_folio_find_private_request(folio);
- if (!req)
- req = nfs_folio_find_swap_request(folio);
- return req;
-}
-
-static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio)
-{
- struct inode *inode = folio_file_mapping(folio)->host;
- struct nfs_page *req, *head;
- int ret;
-
- for (;;) {
- req = nfs_folio_find_head_request(folio);
- if (!req)
- return req;
- head = nfs_page_group_lock_head(req);
- if (head != req)
- nfs_release_request(req);
- if (IS_ERR(head))
- return head;
- ret = nfs_cancel_remove_inode(head, inode);
- if (ret < 0) {
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
- }
- /* Ensure that nobody removed the request before we locked it */
- if (head == nfs_folio_private_request(folio))
- break;
- if (folio_test_swapcache(folio))
- break;
- nfs_unlock_and_release_request(head);
- }
- return head;
-}
-
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct folio *folio, unsigned int offset,
unsigned int count)
{
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
loff_t end, i_size;
pgoff_t end_index;
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio);
- if (i_size > 0 && folio_index(folio) < end_index)
+ if (i_size > 0 && folio->index < end_index)
goto out;
- end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count;
+ end = folio_pos(folio) + (loff_t)offset + (loff_t)count;
if (i_size >= end)
goto out;
trace_nfs_size_grow(inode, end);
@@ -289,6 +190,8 @@ static void nfs_grow_file(struct folio *folio, unsigned int offset,
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
out:
+ /* Atomically update timestamps if they are delegated to us. */
+ nfs_update_delegated_mtime_locked(inode);
spin_unlock(&inode->i_lock);
nfs_fscache_invalidate(inode, 0);
}
@@ -309,9 +212,8 @@ static void nfs_set_pageerror(struct address_space *mapping)
static void nfs_mapping_set_error(struct folio *folio, int error)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
- folio_set_error(folio);
filemap_set_wb_err(mapping, error);
if (mapping->host)
errseq_set(&mapping->host->i_sb->s_wb_err,
@@ -410,7 +312,7 @@ int nfs_congestion_kb;
static void nfs_folio_set_writeback(struct folio *folio)
{
- struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
+ struct nfs_server *nfss = NFS_SERVER(folio->mapping->host);
folio_start_writeback(folio);
if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
@@ -419,12 +321,14 @@ static void nfs_folio_set_writeback(struct folio *folio)
static void nfs_folio_end_writeback(struct folio *folio)
{
- struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
+ struct nfs_server *nfss = NFS_SERVER(folio->mapping->host);
folio_end_writeback(folio);
if (atomic_long_dec_return(&nfss->writeback) <
- NFS_CONGESTION_OFF_THRESH)
+ NFS_CONGESTION_OFF_THRESH) {
nfss->write_congested = 0;
+ wake_up_all(&nfss->write_congestion_wait);
+ }
}
static void nfs_page_end_writeback(struct nfs_page *req)
@@ -548,6 +452,74 @@ void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
}
+/**
+ * nfs_wait_on_request - Wait for a request to complete.
+ * @req: request to wait upon.
+ *
+ * Interruptible by fatal signals only.
+ * The user is responsible for holding a count on the request.
+ */
+static int nfs_wait_on_request(struct nfs_page *req)
+{
+ if (!test_bit(PG_BUSY, &req->wb_flags))
+ return 0;
+ set_bit(PG_CONTENDED2, &req->wb_flags);
+ smp_mb__after_atomic();
+ return wait_on_bit_io(&req->wb_flags, PG_BUSY,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
+ *
+ * This is a helper function for nfs_lock_and_join_requests
+ * returns 0 on success, < 0 on error.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
+ */
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
+{
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref))
+ return 0;
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq);
+ return ret;
+ }
+ }
+ return 0;
+}
+
/*
* nfs_lock_and_join_requests - join all subreqs to the head req
* @folio: the folio used to lookup the "page group" of nfs_page structures
@@ -565,31 +537,75 @@ void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
*/
static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
{
- struct inode *inode = folio_file_mapping(folio)->host;
- struct nfs_page *head;
+ struct inode *inode = folio->mapping->host;
+ struct nfs_page *head, *subreq;
struct nfs_commit_info cinfo;
+ bool removed;
int ret;
- nfs_init_cinfo_from_inode(&cinfo, inode);
/*
* A reference is taken only on the head request which acts as a
* reference to the whole page group - the group will not be destroyed
* until the head reference is released.
*/
- head = nfs_folio_find_and_lock_request(folio);
- if (IS_ERR_OR_NULL(head))
- return head;
+retry:
+ head = nfs_folio_find_head_request(folio);
+ if (!head)
+ return NULL;
+
+ while (!nfs_lock_request(head)) {
+ ret = nfs_wait_on_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ /* Ensure that nobody removed the request before we locked it */
+ if (head != folio->private) {
+ nfs_unlock_and_release_request(head);
+ goto retry;
+ }
+
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ goto out_unlock;
+
+ removed = test_bit(PG_REMOVE, &head->wb_flags);
/* lock each request in the page group */
- ret = nfs_page_group_lock_subrequests(head);
- if (ret < 0) {
+ for (subreq = head->wb_this_page;
+ subreq != head;
+ subreq = subreq->wb_this_page) {
+ if (test_bit(PG_REMOVE, &subreq->wb_flags))
+ removed = true;
+ ret = nfs_page_group_lock_subreq(head, subreq);
+ if (ret < 0)
+ goto out_unlock;
+ }
+
+ nfs_page_group_unlock(head);
+
+ /*
+ * If PG_REMOVE is set on any request, I/O on that request has
+ * completed, but some requests were still under I/O at the time
+ * we locked the head request.
+ *
+ * In that case the above wait for all requests means that all I/O
+ * has now finished, and we can restart from a clean slate. Let the
+ * old requests go away and start from scratch instead.
+ */
+ if (removed) {
+ nfs_unroll_locks(head, head);
nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
+ goto retry;
}
+ nfs_init_cinfo_from_inode(&cinfo, inode);
nfs_join_page_group(head, &cinfo, inode);
-
return head;
+
+out_unlock:
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
}
static void nfs_write_error(struct nfs_page *req, int error)
@@ -641,7 +657,7 @@ static int nfs_page_async_flush(struct folio *folio,
nfs_redirty_request(req);
pgio->pg_error = 0;
} else
- nfs_add_stats(folio_file_mapping(folio)->host,
+ nfs_add_stats(folio->mapping->host,
NFSIOS_WRITEPAGES, 1);
out:
return ret;
@@ -653,7 +669,7 @@ out_launder:
static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
struct nfs_pageio_descriptor *pgio)
{
- nfs_pageio_cond_complete(pgio, folio_index(folio));
+ nfs_pageio_cond_complete(pgio, folio->index);
return nfs_page_async_flush(folio, wbc, pgio);
}
@@ -664,7 +680,7 @@ static int nfs_writepage_locked(struct folio *folio,
struct writeback_control *wbc)
{
struct nfs_pageio_descriptor pgio;
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
@@ -698,12 +714,17 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
struct nfs_pageio_descriptor pgio;
struct nfs_io_completion *ioc = NULL;
unsigned int mntflags = NFS_SERVER(inode)->flags;
+ struct nfs_server *nfss = NFS_SERVER(inode);
int priority = 0;
int err;
- if (wbc->sync_mode == WB_SYNC_NONE &&
- NFS_SERVER(inode)->write_congested)
- return 0;
+ /* Wait with writeback until write congestion eases */
+ if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) {
+ err = wait_event_killable(nfss->write_congestion_wait,
+ nfss->write_congested == 0);
+ if (err)
+ return err;
+ }
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
@@ -742,24 +763,17 @@ out_err:
static void nfs_inode_add_request(struct nfs_page *req)
{
struct folio *folio = nfs_page_to_folio(req);
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct nfs_inode *nfsi = NFS_I(mapping->host);
WARN_ON_ONCE(req->wb_this_page != req);
/* Lock the request! */
nfs_lock_request(req);
-
- /*
- * Swap-space should not get truncated. Hence no need to plug the race
- * with invalidate/truncate.
- */
spin_lock(&mapping->i_private_lock);
- if (likely(!folio_test_swapcache(folio))) {
- set_bit(PG_MAPPED, &req->wb_flags);
- folio_set_private(folio);
- folio->private = req;
- }
+ set_bit(PG_MAPPED, &req->wb_flags);
+ folio_set_private(folio);
+ folio->private = req;
spin_unlock(&mapping->i_private_lock);
atomic_long_inc(&nfsi->nrequests);
/* this a head request for a page group - mark it as having an
@@ -779,10 +793,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
spin_lock(&mapping->i_private_lock);
- if (likely(folio && !folio_test_swapcache(folio))) {
+ if (likely(folio)) {
folio->private = NULL;
folio_clear_private(folio);
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
@@ -803,38 +817,6 @@ static void nfs_mark_request_dirty(struct nfs_page *req)
filemap_dirty_folio(folio_mapping(folio), folio);
}
-/*
- * nfs_page_search_commits_for_head_request_locked
- *
- * Search through commit lists on @inode for the head request for @folio.
- * Must be called while holding the inode (which is cinfo) lock.
- *
- * Returns the head request if found, or NULL if not found.
- */
-static struct nfs_page *
-nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
- struct folio *folio)
-{
- struct nfs_page *freq, *t;
- struct nfs_commit_info cinfo;
- struct inode *inode = &nfsi->vfs_inode;
-
- nfs_init_cinfo_from_inode(&cinfo, inode);
-
- /* search through pnfs commit lists */
- freq = pnfs_search_commit_reqs(inode, &cinfo, folio);
- if (freq)
- return freq->wb_head;
-
- /* Linearly search the commit list for the correct request */
- list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
- if (nfs_page_to_folio(freq) == folio)
- return freq->wb_head;
- }
-
- return NULL;
-}
-
/**
* nfs_request_add_commit_list_locked - add request to a commit list
* @req: pointer to a struct nfs_page
@@ -941,7 +923,7 @@ static void nfs_folio_clear_commit(struct folio *folio)
long nr = folio_nr_pages(folio);
node_stat_mod_folio(folio, NR_WRITEBACK, -nr);
- wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb,
+ wb_stat_mod(&inode_to_bdi(folio->mapping->host)->wb,
WB_WRITEBACK, -nr);
}
}
@@ -1126,7 +1108,7 @@ out_flushme:
*/
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
- error = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
+ error = nfs_wb_folio(folio->mapping->host, folio);
return (error < 0) ? ERR_PTR(error) : NULL;
}
@@ -1202,7 +1184,7 @@ int nfs_flush_incompatible(struct file *file, struct folio *folio)
nfs_release_request(req);
if (!do_flush)
return 0;
- status = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
+ status = nfs_wb_folio(folio->mapping->host, folio);
} while (status == 0);
return status;
}
@@ -1276,7 +1258,7 @@ out:
*/
static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen)
{
- struct inode *inode = folio_file_mapping(folio)->host;
+ struct inode *inode = folio->mapping->host;
struct nfs_inode *nfsi = NFS_I(inode);
if (nfs_have_delegated_attributes(inode))
@@ -1320,7 +1302,7 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
return 0;
if (!nfs_folio_write_uptodate(folio, pagelen))
return 0;
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
+ if (nfs_have_write_delegation(inode))
return 1;
if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
list_empty_careful(&flctx->flc_posix)))
@@ -1354,7 +1336,7 @@ int nfs_update_folio(struct file *file, struct folio *folio,
unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
unsigned int pagelen = nfs_folio_length(folio);
int status = 0;
@@ -1362,14 +1344,18 @@ int nfs_update_folio(struct file *file, struct folio *folio,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
- (long long)(folio_file_pos(folio) + offset));
+ (long long)(folio_pos(folio) + offset));
if (!count)
goto out;
if (nfs_can_extend_write(file, folio, pagelen)) {
- count = max(count + offset, pagelen);
- offset = 0;
+ unsigned int end = count + offset;
+
+ offset = round_down(offset, PAGE_SIZE);
+ if (end < pagelen)
+ end = min(round_up(end, PAGE_SIZE), pagelen);
+ count = end - offset;
}
status = nfs_writepage_setup(ctx, folio, offset, count);
@@ -1514,6 +1500,13 @@ void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
struct nfs_fattr *fattr = &hdr->fattr;
struct inode *inode = hdr->inode;
+ if (nfs_have_delegated_mtime(inode)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS);
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
spin_lock(&inode->i_lock);
nfs_writeback_check_extend(hdr, fattr);
nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
@@ -1837,7 +1830,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
- struct nfs_server *nfss;
struct folio *folio;
while (!list_empty(&data->pages)) {
@@ -1880,9 +1872,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Latency breaker */
cond_resched();
}
- nfss = NFS_SERVER(data->inode);
- if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- nfss->write_congested = 0;
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -2073,17 +2062,17 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
*/
int nfs_wb_folio(struct inode *inode, struct folio *folio)
{
- loff_t range_start = folio_file_pos(folio);
- loff_t range_end = range_start + (loff_t)folio_size(folio) - 1;
+ loff_t range_start = folio_pos(folio);
+ size_t len = folio_size(folio);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
.range_start = range_start,
- .range_end = range_end,
+ .range_end = range_start + len - 1,
};
int ret;
- trace_nfs_writeback_folio(inode, folio);
+ trace_nfs_writeback_folio(inode, range_start, len);
for (;;) {
folio_wait_writeback(folio);
@@ -2101,7 +2090,7 @@ int nfs_wb_folio(struct inode *inode, struct folio *folio)
goto out_error;
}
out_error:
- trace_nfs_writeback_folio_done(inode, folio, ret);
+ trace_nfs_writeback_folio_done(inode, range_start, len, ret);
return ret;
}
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index 1479583fbb62..27cd0d13143b 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -139,6 +139,7 @@ exit_grace(void)
}
MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
+MODULE_DESCRIPTION("NFS client and server infrastructure");
MODULE_LICENSE("GPL");
module_init(init_grace)
module_exit(exit_grace)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 5a5bd85d08f8..ea382b75b26c 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,6 +29,7 @@
#include <linux/nfs3.h>
#include <linux/sort.h>
+MODULE_DESCRIPTION("NFS ACL support");
MODULE_LICENSE("GPL");
struct nfsacl_encode_desc {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 272ab8d5c4d7..ec2ab6429e00 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -162,7 +162,7 @@ config NFSD_V4_SECURITY_LABEL
config NFSD_LEGACY_CLIENT_TRACKING
bool "Support legacy NFSv4 client tracking methods (DEPRECATED)"
depends on NFSD_V4
- default n
+ default y
help
The NFSv4 server needs to store a small amount of information on
stable storage in order to handle state recovery after reboot. Most
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ad9083ca144b..f4704f5d4086 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -664,7 +664,7 @@ static int
nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
void *data)
{
- struct file_lock *fl = data;
+ struct file_lease *fl = data;
/* Only close files for F_SETLEASE leases */
if (fl->c.flc_flags & FL_LEASE)
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index 529a75ecf22e..ca54aa583530 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -40,6 +40,11 @@ static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_AD
[NFSD_A_SERVER_SOCK_ADDR] = NLA_POLICY_NESTED(nfsd_sock_nl_policy),
};
+/* NFSD_CMD_POOL_MODE_SET - do */
+static const struct nla_policy nfsd_pool_mode_set_nl_policy[NFSD_A_POOL_MODE_MODE + 1] = {
+ [NFSD_A_POOL_MODE_MODE] = { .type = NLA_NUL_STRING, },
+};
+
/* Ops table for nfsd */
static const struct genl_split_ops nfsd_nl_ops[] = {
{
@@ -83,6 +88,18 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
.doit = nfsd_nl_listener_get_doit,
.flags = GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NFSD_CMD_POOL_MODE_SET,
+ .doit = nfsd_nl_pool_mode_set_doit,
+ .policy = nfsd_pool_mode_set_nl_policy,
+ .maxattr = NFSD_A_POOL_MODE_MODE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NFSD_CMD_POOL_MODE_GET,
+ .doit = nfsd_nl_pool_mode_get_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
struct genl_family nfsd_nl_family __ro_after_init = {
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index 2e132ef328f8..8eb903f24c41 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -23,6 +23,8 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info);
int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info);
int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info);
int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info);
extern struct genl_family nfsd_nl_family;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 12b2b9bc07bf..4e3be7201b1c 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -308,8 +308,6 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp)
fh_put(&resp->fh);
}
-struct nfsd3_voidargs { int dummy; };
-
#define ST 1 /* status*/
#define AT 21 /* attributes */
#define pAT (1+AT) /* post attributes - conditional */
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 73adca47d373..5e34e98db969 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -221,8 +221,6 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp)
posix_acl_release(resp->acl_default);
}
-struct nfsd3_voidargs { int dummy; };
-
#define ST 1 /* status*/
#define AT 21 /* attributes */
#define pAT (1+AT) /* post attributes - conditional */
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 46bd20fe5c0f..2e39cf2e502a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2269,7 +2269,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
const struct nfsd4_layout_ops *ops;
struct nfs4_layout_stateid *ls;
__be32 nfserr;
- int accmode = NFSD_MAY_READ_IF_EXEC;
+ int accmode = NFSD_MAY_READ_IF_EXEC | NFSD_MAY_OWNER_OVERRIDE;
switch (lgp->lg_seg.iomode) {
case IOMODE_READ:
@@ -2359,7 +2359,8 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
struct nfs4_layout_stateid *ls;
__be32 nfserr;
- nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+ nfserr = fh_verify(rqstp, current_fh, 0,
+ NFSD_MAY_WRITE | NFSD_MAY_OWNER_OVERRIDE);
if (nfserr)
goto out;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 2c060e0b1604..67d8673a9391 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -2086,8 +2086,8 @@ do_init:
status = nn->client_tracking_ops->init(net);
out:
if (status) {
- printk(KERN_WARNING "NFSD: Unable to initialize client "
- "recovery tracking! (%d)\n", status);
+ pr_warn("NFSD: Unable to initialize client recovery tracking! (%d)\n", status);
+ pr_warn("NFSD: Is nfsdcld running? If not, enable CONFIG_NFSD_LEGACY_CLIENT_TRACKING.\n");
nn->client_tracking_ops = NULL;
}
return status;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c7bfd2180e3f..42b41d55d4ed 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -118,11 +118,11 @@ static int zero_clientid(clientid_t *clid)
* operation described in @argp finishes.
*/
static void *
-svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
+svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, size_t len)
{
struct svcxdr_tmpbuf *tb;
- tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
+ tb = kmalloc(struct_size(tb, buf, len), GFP_KERNEL);
if (!tb)
return NULL;
tb->next = argp->to_free;
@@ -138,9 +138,9 @@ svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
* buffer might end on a page boundary.
*/
static char *
-svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
+svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, size_t len)
{
- char *p = svcxdr_tmpalloc(argp, len + 1);
+ char *p = svcxdr_tmpalloc(argp, size_add(len, 1));
if (!p)
return NULL;
@@ -150,7 +150,7 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
}
static void *
-svcxdr_savemem(struct nfsd4_compoundargs *argp, __be32 *p, u32 len)
+svcxdr_savemem(struct nfsd4_compoundargs *argp, __be32 *p, size_t len)
{
__be32 *tmp;
@@ -2146,7 +2146,7 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
*/
static __be32
nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr,
- char **bufp, u32 buflen)
+ char **bufp, size_t buflen)
{
struct page **pages = xdr->pages;
struct kvec *head = xdr->head;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 533b65057e18..9e0ea6fc2aa3 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -406,7 +406,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return -EINVAL;
trace_nfsd_ctl_threads(net, newthreads);
mutex_lock(&nfsd_mutex);
- rv = nfsd_svc(newthreads, net, file->f_cred, NULL);
+ rv = nfsd_svc(1, &newthreads, net, file->f_cred, NULL);
mutex_unlock(&nfsd_mutex);
if (rv < 0)
return rv;
@@ -481,6 +481,14 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
goto out_free;
trace_nfsd_ctl_pool_threads(net, i, nthreads[i]);
}
+
+ /*
+ * There must always be a thread in pool 0; the admin
+ * can't shut down NFS completely using pool_threads.
+ */
+ if (nthreads[0] == 0)
+ nthreads[0] = 1;
+
rv = nfsd_set_nrthreads(i, nthreads, net);
if (rv)
goto out_free;
@@ -1637,7 +1645,7 @@ out_unlock:
*/
int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
{
- int nthreads = 0, count = 0, nrpools, ret = -EOPNOTSUPP, rem;
+ int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem;
struct net *net = genl_info_net(info);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
const struct nlattr *attr;
@@ -1654,15 +1662,22 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
mutex_lock(&nfsd_mutex);
- nrpools = nfsd_nrpools(net);
- if (nrpools && count > nrpools)
- count = nrpools;
-
- /* XXX: make this handle non-global pool-modes */
- if (count > 1)
+ nrpools = max(count, nfsd_nrpools(net));
+ nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL);
+ if (!nthreads) {
+ ret = -ENOMEM;
goto out_unlock;
+ }
+
+ i = 0;
+ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ if (nla_type(attr) == NFSD_A_SERVER_THREADS) {
+ nthreads[i++] = nla_get_u32(attr);
+ if (i >= nrpools)
+ break;
+ }
+ }
- nthreads = nla_get_u32(info->attrs[NFSD_A_SERVER_THREADS]);
if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
info->attrs[NFSD_A_SERVER_LEASETIME] ||
info->attrs[NFSD_A_SERVER_SCOPE]) {
@@ -1696,12 +1711,13 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
scope = nla_data(attr);
}
- ret = nfsd_svc(nthreads, net, get_current_cred(), scope);
-
+ ret = nfsd_svc(nrpools, nthreads, net, get_current_cred(), scope);
+ if (ret > 0)
+ ret = 0;
out_unlock:
mutex_unlock(&nfsd_mutex);
-
- return ret == nthreads ? 0 : ret;
+ kfree(nthreads);
+ return ret;
}
/**
@@ -2141,6 +2157,63 @@ err_free_msg:
}
/**
+ * nfsd_nl_pool_mode_set_doit - set the number of running threads
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_pool_mode_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ const struct nlattr *attr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NFSD_A_POOL_MODE_MODE))
+ return -EINVAL;
+
+ attr = info->attrs[NFSD_A_POOL_MODE_MODE];
+ return sunrpc_set_pool_mode(nla_data(attr));
+}
+
+/**
+ * nfsd_nl_pool_mode_get_doit - get info about pool_mode
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_pool_mode_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ char buf[16];
+ void *hdr;
+ int err;
+
+ if (sunrpc_get_pool_mode(buf, ARRAY_SIZE(buf)) >= ARRAY_SIZE(buf))
+ return -ERANGE;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ err = -EMSGSIZE;
+ hdr = genlmsg_iput(skb, info);
+ if (!hdr)
+ goto err_free_msg;
+
+ err = nla_put_string(skb, NFSD_A_POOL_MODE_MODE, buf) |
+ nla_put_u32(skb, NFSD_A_POOL_MODE_NPOOLS, nfsd_nrpools(net));
+ if (err)
+ goto err_free_msg;
+
+ genlmsg_end(skb, hdr);
+ return genlmsg_reply(skb, info);
+
+err_free_msg:
+ nlmsg_free(skb);
+ return err;
+}
+
+/**
* nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
* @net: a freshly-created network namespace
*
@@ -2169,6 +2242,8 @@ static __net_init int nfsd_net_init(struct net *net)
nn->nfsd_svcstats.program = &nfsd_program;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
+ nn->nfsd_info.mutex = &nfsd_mutex;
+ nn->nfsd_serv = NULL;
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 8f4f239d9f8a..cec8697b1cd6 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -103,7 +103,8 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
/*
* Function prototypes.
*/
-int nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope);
+int nfsd_svc(int n, int *nservers, struct net *net,
+ const struct cred *cred, const char *scope);
int nfsd_dispatch(struct svc_rqst *rqstp);
int nfsd_nrthreads(struct net *);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 0b75305fb5f5..dd4e11a703aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -247,7 +247,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
dentry = dget(exp->ex_path.dentry);
else {
dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid,
- data_left, fileid_type,
+ data_left, fileid_type, 0,
nfsd_acceptable, exp);
if (IS_ERR_OR_NULL(dentry)) {
trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cd9a6a1a9fc8..0bc8eaa5e009 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -672,7 +672,6 @@ int nfsd_create_serv(struct net *net)
return error;
}
spin_lock(&nfsd_notifier_lock);
- nn->nfsd_info.mutex = &nfsd_mutex;
nn->nfsd_serv = serv;
spin_unlock(&nfsd_notifier_lock);
@@ -710,6 +709,19 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
return 0;
}
+/**
+ * nfsd_set_nrthreads - set the number of running threads in the net's service
+ * @n: number of array members in @nthreads
+ * @nthreads: array of thread counts for each pool
+ * @net: network namespace to operate within
+ *
+ * This function alters the number of running threads for the given network
+ * namespace in each pool. If passed an array longer then the number of pools
+ * the extra pool settings are ignored. If passed an array shorter than the
+ * number of pools, the missing values are interpreted as 0's.
+ *
+ * Returns 0 on success or a negative errno on error.
+ */
int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
@@ -717,11 +729,18 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
int err = 0;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- WARN_ON(!mutex_is_locked(&nfsd_mutex));
+ lockdep_assert_held(&nfsd_mutex);
if (nn->nfsd_serv == NULL || n <= 0)
return 0;
+ /*
+ * Special case: When n == 1, pass in NULL for the pool, so that the
+ * change is distributed equally among them.
+ */
+ if (n == 1)
+ return svc_set_num_threads(nn->nfsd_serv, NULL, nthreads[0]);
+
if (n > nn->nfsd_serv->sv_nrpools)
n = nn->nfsd_serv->sv_nrpools;
@@ -744,31 +763,40 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
}
}
- /*
- * There must always be a thread in pool 0; the admin
- * can't shut down NFS completely using pool_threads.
- */
- if (nthreads[0] == 0)
- nthreads[0] = 1;
-
/* apply the new numbers */
for (i = 0; i < n; i++) {
err = svc_set_num_threads(nn->nfsd_serv,
&nn->nfsd_serv->sv_pools[i],
nthreads[i]);
if (err)
- break;
+ goto out;
}
+
+ /* Anything undefined in array is considered to be 0 */
+ for (i = n; i < nn->nfsd_serv->sv_nrpools; ++i) {
+ err = svc_set_num_threads(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i],
+ 0);
+ if (err)
+ goto out;
+ }
+out:
return err;
}
-/*
- * Adjust the number of threads and return the new number of threads.
- * This is also the function that starts the server if necessary, if
- * this is the first time nrservs is nonzero.
+/**
+ * nfsd_svc: start up or shut down the nfsd server
+ * @n: number of array members in @nthreads
+ * @nthreads: array of thread counts for each pool
+ * @net: network namespace to operate within
+ * @cred: credentials to use for xprt creation
+ * @scope: server scope value (defaults to nodename)
+ *
+ * Adjust the number of threads in each pool and return the new
+ * total number of threads in the service.
*/
int
-nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scope)
+nfsd_svc(int n, int *nthreads, struct net *net, const struct cred *cred, const char *scope)
{
int error;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -778,13 +806,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scop
dprintk("nfsd: creating service\n");
- nrservs = max(nrservs, 0);
- nrservs = min(nrservs, NFSD_MAXSERVS);
- error = 0;
-
- if (nrservs == 0 && nn->nfsd_serv == NULL)
- goto out;
-
strscpy(nn->nfsd_name, scope ? scope : utsname()->nodename,
sizeof(nn->nfsd_name));
@@ -796,7 +817,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred, const char *scop
error = nfsd_startup_net(net, cred);
if (error)
goto out_put;
- error = svc_set_num_threads(serv, NULL, nrservs);
+ error = nfsd_set_nrthreads(n, nthreads, net);
if (error)
goto out_put;
error = serv->sv_nrthreads;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 89caef7513db..ba50388ee4bf 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset, entries_per_group, lock);
+ bitmap, group_offset, entries_per_group, lock,
+ wrap);
+ /*
+ * Since the search for a free slot in the second and
+ * subsequent bitmap blocks always starts from the
+ * beginning, the wrap flag only has an effect on the
+ * first search.
+ */
kunmap_local(bitmap_kaddr);
if (pos >= 0)
goto found;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index b667e869ac07..d825a9faca6d 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 383f0afa2cea..cd14ea25968c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -450,15 +450,9 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
const struct buffer_head *bh)
{
- struct buffer_head *pbh;
- __u64 key;
+ loff_t pos = folio_pos(bh->b_folio) + bh_offset(bh);
- key = page_index(bh->b_page) << (PAGE_SHIFT -
- bmap->b_inode->i_blkbits);
- for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
- key++;
-
- return key;
+ return pos >> bmap->b_inode->i_blkbits;
}
__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 180fc8d36213..fc1caf63a42a 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 52e50b1b7f22..4a29b0138d75 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct folio *folio, char *kaddr)
goto Enamelen;
if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
goto Espan;
+ if (unlikely(p->inode &&
+ NILFS_PRIVATE_INODE(le64_to_cpu(p->inode))))
+ goto Einumber;
}
if (offs != limit)
goto Eend;
@@ -160,6 +163,9 @@ Enamelen:
goto bad_entry;
Espan:
error = "directory entry across blocks";
+ goto bad_entry;
+Einumber:
+ error = "disallowed inode number";
bad_entry:
nilfs_error(sb,
"bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
@@ -377,11 +383,39 @@ found:
struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop)
{
- struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop);
+ struct folio *folio;
+ struct nilfs_dir_entry *de, *next_de;
+ size_t limit;
+ char *msg;
+ de = nilfs_get_folio(dir, 0, &folio);
if (IS_ERR(de))
return NULL;
- return nilfs_next_entry(de);
+
+ limit = nilfs_last_byte(dir, 0); /* is a multiple of chunk size */
+ if (unlikely(!limit || le64_to_cpu(de->inode) != dir->i_ino ||
+ !nilfs_match(1, ".", de))) {
+ msg = "missing '.'";
+ goto fail;
+ }
+
+ next_de = nilfs_next_entry(de);
+ /*
+ * If "next_de" has not reached the end of the chunk, there is
+ * at least one more record. Check whether it matches "..".
+ */
+ if (unlikely((char *)next_de == (char *)de + nilfs_chunk_size(dir) ||
+ !nilfs_match(2, "..", next_de))) {
+ msg = "missing '..'";
+ goto fail;
+ }
+ *foliop = folio;
+ return next_de;
+
+fail:
+ nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg);
+ folio_release_kmap(folio, de);
+ return NULL;
}
ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 612e609158b5..1e86b9303b7c 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 728e90be3570..4017f7856440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -116,9 +116,15 @@ enum {
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
+ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
+ ((ino) >= NILFS_FIRST_INO(sb) || \
+ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
+
+#define NILFS_PRIVATE_INODE(ino) ({ \
+ ino_t __ino = (ino); \
+ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
+ (__ino) != NILFS_SKETCH_INO); })
/**
* struct nilfs_transaction_info: context information for synchronization
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index f41d7b6d432c..e44dde57ab65 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
}
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+ if (nilfs->ns_first_ino < NILFS_USER_INO) {
+ nilfs_err(nilfs->ns_sb,
+ "too small lower limit for non-reserved inode numbers: %u",
+ nilfs->ns_first_ino);
+ return -EINVAL;
+ }
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 85da0629415d..1e829ed7b0ef 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -182,7 +182,7 @@ struct the_nilfs {
unsigned long ns_nrsvsegs;
unsigned long ns_first_data_block;
int ns_inode_size;
- int ns_first_ino;
+ unsigned int ns_first_ino;
u32 ns_crc_seed;
/* /sys/fs/<nilfs>/<device> */
diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c
index 266c2d7d50bd..2963f3299d7e 100644
--- a/fs/nls/mac-celtic.c
+++ b/fs/nls/mac-celtic.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macceltic(void)
module_init(init_nls_macceltic)
module_exit(exit_nls_macceltic)
+MODULE_DESCRIPTION("NLS Codepage macceltic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c
index 9789c6057551..43b20f4bdb67 100644
--- a/fs/nls/mac-centeuro.c
+++ b/fs/nls/mac-centeuro.c
@@ -528,4 +528,5 @@ static void __exit exit_nls_maccenteuro(void)
module_init(init_nls_maccenteuro)
module_exit(exit_nls_maccenteuro)
+MODULE_DESCRIPTION("NLS Codepage maccenteuro");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c
index bb19e7a07d43..62730d6a64e5 100644
--- a/fs/nls/mac-croatian.c
+++ b/fs/nls/mac-croatian.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_maccroatian(void)
module_init(init_nls_maccroatian)
module_exit(exit_nls_maccroatian)
+MODULE_DESCRIPTION("NLS Codepage maccroatian");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c
index 2a7dea36acba..7a5c4d16aac8 100644
--- a/fs/nls/mac-cyrillic.c
+++ b/fs/nls/mac-cyrillic.c
@@ -493,4 +493,5 @@ static void __exit exit_nls_maccyrillic(void)
module_init(init_nls_maccyrillic)
module_exit(exit_nls_maccyrillic)
+MODULE_DESCRIPTION("NLS Codepage maccyrillic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c
index 77b001653588..3d22f03a90b6 100644
--- a/fs/nls/mac-gaelic.c
+++ b/fs/nls/mac-gaelic.c
@@ -563,4 +563,5 @@ static void __exit exit_nls_macgaelic(void)
module_init(init_nls_macgaelic)
module_exit(exit_nls_macgaelic)
+MODULE_DESCRIPTION("NLS Codepage macgaelic");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c
index 1eccf499e2eb..de3aa9ddb5b1 100644
--- a/fs/nls/mac-greek.c
+++ b/fs/nls/mac-greek.c
@@ -493,4 +493,5 @@ static void __exit exit_nls_macgreek(void)
module_init(init_nls_macgreek)
module_exit(exit_nls_macgreek)
+MODULE_DESCRIPTION("NLS Codepage macgreek");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c
index cbd0875c6d69..0bba83f9d415 100644
--- a/fs/nls/mac-iceland.c
+++ b/fs/nls/mac-iceland.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_maciceland(void)
module_init(init_nls_maciceland)
module_exit(exit_nls_maciceland)
+MODULE_DESCRIPTION("NLS Codepage maciceland");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c
index fba8357aaf03..493386832dfd 100644
--- a/fs/nls/mac-inuit.c
+++ b/fs/nls/mac-inuit.c
@@ -528,4 +528,5 @@ static void __exit exit_nls_macinuit(void)
module_init(init_nls_macinuit)
module_exit(exit_nls_macinuit)
+MODULE_DESCRIPTION("NLS Codepage macinuit");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c
index b6a98a5208cd..d3c082173c20 100644
--- a/fs/nls/mac-roman.c
+++ b/fs/nls/mac-roman.c
@@ -633,4 +633,5 @@ static void __exit exit_nls_macroman(void)
module_init(init_nls_macroman)
module_exit(exit_nls_macroman)
+MODULE_DESCRIPTION("NLS Codepage macroman");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c
index 25547f023638..a7735852f2d5 100644
--- a/fs/nls/mac-romanian.c
+++ b/fs/nls/mac-romanian.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macromanian(void)
module_init(init_nls_macromanian)
module_exit(exit_nls_macromanian)
+MODULE_DESCRIPTION("NLS Codepage macromanian");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c
index b5454bc7b7fa..d77e9b6b7d7c 100644
--- a/fs/nls/mac-turkish.c
+++ b/fs/nls/mac-turkish.c
@@ -598,4 +598,5 @@ static void __exit exit_nls_macturkish(void)
module_init(init_nls_macturkish)
module_exit(exit_nls_macturkish)
+MODULE_DESCRIPTION("NLS Codepage macturkish");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index a2620650d5e4..068143d71284 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -163,4 +163,5 @@ static void __exit exit_nls_ascii(void)
module_init(init_nls_ascii)
module_exit(exit_nls_ascii)
+MODULE_DESCRIPTION("NLS ASCII (United States)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index a026dbd3593f..18d597e49a19 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -545,4 +545,5 @@ EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);
+MODULE_DESCRIPTION("Base file system native language support");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index ace3e19d3407..e22a57a4b828 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -343,4 +343,5 @@ static void __exit exit_nls_cp1250(void)
module_init(init_nls_cp1250)
module_exit(exit_nls_cp1250)
+MODULE_DESCRIPTION("NLS Windows CP1250 (Slavic/Central European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index 9273ddfd08a1..6f46d339f23c 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -298,4 +298,5 @@ static void __exit exit_nls_cp1251(void)
module_init(init_nls_cp1251)
module_exit(exit_nls_cp1251)
+MODULE_DESCRIPTION("NLS Windows CP1251 (Bulgarian, Belarusian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index 1caf5dfed85b..299e089d4301 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -380,5 +380,6 @@ static void __exit exit_nls_cp1255(void)
module_init(init_nls_cp1255)
module_exit(exit_nls_cp1255)
+MODULE_DESCRIPTION("NLS Hebrew charsets (ISO-8859-8, CP1255)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(iso8859-8);
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index 7ddb830da3fd..ab880499ea32 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp437(void)
module_init(init_nls_cp437)
module_exit(exit_nls_cp437)
+MODULE_DESCRIPTION("NLS Codepage 437 (United States, Canada)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index c593f683a0cd..5c37618296e9 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -347,4 +347,5 @@ static void __exit exit_nls_cp737(void)
module_init(init_nls_cp737)
module_exit(exit_nls_cp737)
+MODULE_DESCRIPTION("NLS Codepage 737 (Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index 554c863745f2..51ccc908901f 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -316,4 +316,5 @@ static void __exit exit_nls_cp775(void)
module_init(init_nls_cp775)
module_exit(exit_nls_cp775)
+MODULE_DESCRIPTION("NLS Codepage 775 (Baltic Rim)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index 56cccd14b40b..5f9b9507a8b6 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -312,4 +312,5 @@ static void __exit exit_nls_cp850(void)
module_init(init_nls_cp850)
module_exit(exit_nls_cp850)
+MODULE_DESCRIPTION("NLS Codepage 850 (Europe)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index 7cdc05ac1d40..fc513a5e8358 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -334,4 +334,5 @@ static void __exit exit_nls_cp852(void)
module_init(init_nls_cp852)
module_exit(exit_nls_cp852)
+MODULE_DESCRIPTION("NLS Codepage 852 (Central/Eastern Europe)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index 7426eea05663..a43be58adb36 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -296,4 +296,5 @@ static void __exit exit_nls_cp855(void)
module_init(init_nls_cp855)
module_exit(exit_nls_cp855)
+MODULE_DESCRIPTION("NLS Codepage 855 (Cyrillic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index 098309733ebd..772cd4195bad 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -298,4 +298,5 @@ static void __exit exit_nls_cp857(void)
module_init(init_nls_cp857)
module_exit(exit_nls_cp857)
+MODULE_DESCRIPTION("NLS Codepage 857 (Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index 84224478e731..36cf4ca11966 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -361,4 +361,5 @@ static void __exit exit_nls_cp860(void)
module_init(init_nls_cp860)
module_exit(exit_nls_cp860)
+MODULE_DESCRIPTION("NLS Codepage 860 (Portuguese)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index dc873e4be092..b7397d079f8f 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp861(void)
module_init(init_nls_cp861)
module_exit(exit_nls_cp861)
+MODULE_DESCRIPTION("NLS Codepage 861 (Icelandic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index d5263e3c5566..fd3b95d1e95d 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -418,4 +418,5 @@ static void __exit exit_nls_cp862(void)
module_init(init_nls_cp862)
module_exit(exit_nls_cp862)
+MODULE_DESCRIPTION("NLS Codepage 862 (Hebrew)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index 051c9832e36a..813ae7944249 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -378,4 +378,5 @@ static void __exit exit_nls_cp863(void)
module_init(init_nls_cp863)
module_exit(exit_nls_cp863)
+MODULE_DESCRIPTION("NLS Codepage 863 (Canadian French)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index 97eb1273b2f7..d9eb6d5cd47a 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -404,4 +404,5 @@ static void __exit exit_nls_cp864(void)
module_init(init_nls_cp864)
module_exit(exit_nls_cp864)
+MODULE_DESCRIPTION("NLS Codepage 864 (Arabic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 111214228525..2678ffd98bb6 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -384,4 +384,5 @@ static void __exit exit_nls_cp865(void)
module_init(init_nls_cp865)
module_exit(exit_nls_cp865)
+MODULE_DESCRIPTION("NLS Codepage 865 (Norwegian, Danish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index ffdcbc3fc38d..7e93d0a3802a 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -302,4 +302,5 @@ static void __exit exit_nls_cp866(void)
module_init(init_nls_cp866)
module_exit(exit_nls_cp866)
+MODULE_DESCRIPTION("NLS Codepage 866 (Cyrillic/Russian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 3b5a34589354..4491737dd5cb 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -312,4 +312,5 @@ static void __exit exit_nls_cp869(void)
module_init(init_nls_cp869)
module_exit(exit_nls_cp869)
+MODULE_DESCRIPTION("NLS Codepage 869 (Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index 8dfaa10710fa..4fcfbf8ca72c 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -271,5 +271,6 @@ static void __exit exit_nls_cp874(void)
module_init(init_nls_cp874)
module_exit(exit_nls_cp874)
+MODULE_DESCRIPTION("NLS Thai charset (CP874, TIS-620)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(tis-620);
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 67b7398e8483..e5e6270fcca6 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -7929,5 +7929,6 @@ static void __exit exit_nls_cp932(void)
module_init(init_nls_cp932)
module_exit(exit_nls_cp932)
+MODULE_DESCRIPTION("NLS Japanese charset (Shift-JIS)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(sjis);
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index c96546cfec9f..91d0a15fd7f9 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -11107,5 +11107,6 @@ static void __exit exit_nls_cp936(void)
module_init(init_nls_cp936)
module_exit(exit_nls_cp936)
+MODULE_DESCRIPTION("NLS Simplified Chinese charset (CP936, GB2312)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(gb2312);
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 199171e97aa4..3ae03c76d59c 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -13942,5 +13942,6 @@ static void __exit exit_nls_cp949(void)
module_init(init_nls_cp949)
module_exit(exit_nls_cp949)
+MODULE_DESCRIPTION("NLS Korean charset (CP949, EUC-KR)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(euc-kr);
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index 8e1418708209..e968aa80198d 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -9478,5 +9478,6 @@ static void __exit exit_nls_cp950(void)
module_init(init_nls_cp950)
module_exit(exit_nls_cp950)
+MODULE_DESCRIPTION("NLS Traditional Chinese charset (Big5)");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(big5);
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 162b3f160353..0191cc9d955e 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -577,4 +577,5 @@ static void __exit exit_nls_euc_jp(void)
module_init(init_nls_euc_jp)
module_exit(exit_nls_euc_jp)
+MODULE_DESCRIPTION("NLS Japanese charset (EUC-JP)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 69ac020d43b1..a181be488f7d 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -254,4 +254,5 @@ static void __exit exit_nls_iso8859_1(void)
module_init(init_nls_iso8859_1)
module_exit(exit_nls_iso8859_1)
+MODULE_DESCRIPTION("NLS ISO 8859-1 (Latin 1; Western European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index afb3f8f275f0..8e2be5bfeaf1 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -282,4 +282,5 @@ static void __exit exit_nls_iso8859_13(void)
module_init(init_nls_iso8859_13)
module_exit(exit_nls_iso8859_13)
+MODULE_DESCRIPTION("NLS ISO 8859-13 (Latin 7; Baltic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index 046370f0b6f0..c789eccb8a69 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -338,4 +338,5 @@ static void __exit exit_nls_iso8859_14(void)
module_init(init_nls_iso8859_14)
module_exit(exit_nls_iso8859_14)
+MODULE_DESCRIPTION("NLS ISO 8859-14 (Latin 8; Celtic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 7e34a841a056..ffec649176fb 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -304,4 +304,5 @@ static void __exit exit_nls_iso8859_15(void)
module_init(init_nls_iso8859_15)
module_exit(exit_nls_iso8859_15)
+MODULE_DESCRIPTION("NLS ISO 8859-15 (Latin 9; Western European Languages with Euro)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index 7dd571181741..d352334d0314 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_2(void)
module_init(init_nls_iso8859_2)
module_exit(exit_nls_iso8859_2)
+MODULE_DESCRIPTION("NLS ISO 8859-2 (Latin 2; Slavic/Central European Languages)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index 740b75ec4493..09990e6634d2 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_3(void)
module_init(init_nls_iso8859_3)
module_exit(exit_nls_iso8859_3)
+MODULE_DESCRIPTION("NLS ISO 8859-3 (Latin 3; Esperanto, Galician, Maltese, Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index 8826021e32f5..92795224912e 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -305,4 +305,5 @@ static void __exit exit_nls_iso8859_4(void)
module_init(init_nls_iso8859_4)
module_exit(exit_nls_iso8859_4)
+MODULE_DESCRIPTION("NLS ISO 8859-4 (Latin 4; old Baltic charset)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 7c04057a1ad8..32309315307a 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_5(void)
module_init(init_nls_iso8859_5)
module_exit(exit_nls_iso8859_5)
+MODULE_DESCRIPTION("NLS ISO 8859-5 (Cyrillic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index d4a881400d74..c18183469d2a 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -260,4 +260,5 @@ static void __exit exit_nls_iso8859_6(void)
module_init(init_nls_iso8859_6)
module_exit(exit_nls_iso8859_6)
+MODULE_DESCRIPTION("NLS ISO 8859-6 (Arabic)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index 37b75d825a75..3652d6832864 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -314,4 +314,5 @@ static void __exit exit_nls_iso8859_7(void)
module_init(init_nls_iso8859_7)
module_exit(exit_nls_iso8859_7)
+MODULE_DESCRIPTION("NLS ISO 8859-7 (Modern Greek)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 557b98250d37..11a67834b855 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -269,4 +269,5 @@ static void __exit exit_nls_iso8859_9(void)
module_init(init_nls_iso8859_9)
module_exit(exit_nls_iso8859_9)
+MODULE_DESCRIPTION("NLS ISO 8859-9 (Latin 5; Turkish)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 811f232fccfb..e3dca27a3803 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -320,4 +320,5 @@ static void __exit exit_nls_koi8_r(void)
module_init(init_nls_koi8_r)
module_exit(exit_nls_koi8_r)
+MODULE_DESCRIPTION("NLS KOI8-R (Russian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index a80a741a8676..07afcd9e58c0 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -79,4 +79,5 @@ static void __exit exit_nls_koi8_ru(void)
module_init(init_nls_koi8_ru)
module_exit(exit_nls_koi8_ru)
+MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 7e029e4c188a..f60645758c1a 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -327,4 +327,5 @@ static void __exit exit_nls_koi8_u(void)
module_init(init_nls_koi8_u)
module_exit(exit_nls_koi8_u)
+MODULE_DESCRIPTION("NLS KOI8-U (Ukrainian)");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_ucs2_utils.c b/fs/nls/nls_ucs2_utils.c
index a69781c54dd8..d4564b79d7bf 100644
--- a/fs/nls/nls_ucs2_utils.c
+++ b/fs/nls/nls_ucs2_utils.c
@@ -16,6 +16,7 @@
#include <asm/unaligned.h>
#include "nls_ucs2_utils.h"
+MODULE_DESCRIPTION("NLS UCS-2");
MODULE_LICENSE("GPL");
/*
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index afcfbc4a14db..a0fa0610eaac 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -64,4 +64,5 @@ static void __exit exit_nls_utf8(void)
module_init(init_nls_utf8)
module_exit(exit_nls_utf8)
+MODULE_DESCRIPTION("NLS UTF-8");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ff69ae24c4e8..272c8a1dab3c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -117,17 +117,13 @@ void fsnotify_sb_free(struct super_block *sb)
* parent cares. Thus when an event happens on a child it can quickly tell
* if there is a need to find a parent and send the event to the parent.
*/
-void __fsnotify_update_child_dentry_flags(struct inode *inode)
+void fsnotify_set_children_dentry_flags(struct inode *inode)
{
struct dentry *alias;
- int watched;
if (!S_ISDIR(inode->i_mode))
return;
- /* determine if the children should tell inode about their events */
- watched = fsnotify_inode_watches_children(inode);
-
spin_lock(&inode->i_lock);
/* run all of the dentries associated with this inode. Since this is a
* directory, there damn well better only be one item on this list */
@@ -143,10 +139,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
continue;
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
- if (watched)
- child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
- else
- child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+ child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
spin_unlock(&child->d_lock);
}
spin_unlock(&alias->d_lock);
@@ -154,6 +147,24 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
spin_unlock(&inode->i_lock);
}
+/*
+ * Lazily clear false positive PARENT_WATCHED flag for child whose parent had
+ * stopped watching children.
+ */
+static void fsnotify_clear_child_dentry_flag(struct inode *pinode,
+ struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ /*
+ * d_lock is a sufficient barrier to prevent observing a non-watched
+ * parent state from before the fsnotify_set_children_dentry_flags()
+ * or fsnotify_update_flags() call that had set PARENT_WATCHED.
+ */
+ if (!fsnotify_inode_watches_children(pinode))
+ dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+ spin_unlock(&dentry->d_lock);
+}
+
/* Are inode/sb/mount interested in parent and name info with this event? */
static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
__u32 mask)
@@ -228,7 +239,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
p_inode = parent->d_inode;
p_mask = fsnotify_inode_watches_children(p_inode);
if (unlikely(parent_watched && !p_mask))
- __fsnotify_update_child_dentry_flags(p_inode);
+ fsnotify_clear_child_dentry_flag(p_inode, dentry);
/*
* Include parent/name in notification either if some notification
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 2d059f789ee3..663759ed6fbc 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -93,7 +93,7 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
*/
-extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+extern void fsnotify_set_children_dentry_flags(struct inode *inode);
extern struct kmem_cache *fsnotify_mark_connector_cachep;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c3eefa70633c..5e170e713088 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -250,6 +250,24 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
return fsnotify_update_iref(conn, want_iref);
}
+static bool fsnotify_conn_watches_children(
+ struct fsnotify_mark_connector *conn)
+{
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+ return false;
+
+ return fsnotify_inode_watches_children(fsnotify_conn_inode(conn));
+}
+
+static void fsnotify_conn_set_children_dentry_flags(
+ struct fsnotify_mark_connector *conn)
+{
+ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
+ return;
+
+ fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
+}
+
/*
* Calculate mask of events for a list of marks. The caller must make sure
* connector and connector->obj cannot disappear under us. Callers achieve
@@ -258,15 +276,23 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
*/
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
+ bool update_children;
+
if (!conn)
return;
spin_lock(&conn->lock);
+ update_children = !fsnotify_conn_watches_children(conn);
__fsnotify_recalc_mask(conn);
+ update_children &= fsnotify_conn_watches_children(conn);
spin_unlock(&conn->lock);
- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
- __fsnotify_update_child_dentry_flags(
- fsnotify_conn_inode(conn));
+ /*
+ * Set children's PARENT_WATCHED flags only if parent started watching.
+ * When parent stops watching, we clear false positive PARENT_WATCHED
+ * flags lazily in __fsnotify_parent().
+ */
+ if (update_children)
+ fsnotify_conn_set_children_dentry_flags(conn);
}
/* Free all connectors queued for freeing once SRCU period ends */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 07e22a15ef02..97c37a9631e5 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,10 +8,12 @@
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include "mount.h"
#include "internal.h"
static struct vfsmount *nsfs_mnt;
@@ -82,40 +84,47 @@ int ns_get_path(struct path *path, struct task_struct *task,
return ns_get_path_cb(path, ns_get_path_task, &args);
}
-int open_related_ns(struct ns_common *ns,
- struct ns_common *(*get_ns)(struct ns_common *ns))
+/**
+ * open_namespace - open a namespace
+ * @ns: the namespace to open
+ *
+ * This will consume a reference to @ns indendent of success or failure.
+ *
+ * Return: A file descriptor on success or a negative error code on failure.
+ */
+int open_namespace(struct ns_common *ns)
{
- struct path path = {};
- struct ns_common *relative;
+ struct path path __free(path_put) = {};
struct file *f;
int err;
- int fd;
- fd = get_unused_fd_flags(O_CLOEXEC);
+ /* call first to consume reference */
+ err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+ if (err < 0)
+ return err;
+
+ CLASS(get_unused_fd, fd)(O_CLOEXEC);
if (fd < 0)
return fd;
- relative = get_ns(ns);
- if (IS_ERR(relative)) {
- put_unused_fd(fd);
- return PTR_ERR(relative);
- }
+ f = dentry_open(&path, O_RDONLY, current_cred());
+ if (IS_ERR(f))
+ return PTR_ERR(f);
- err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
- if (err < 0) {
- put_unused_fd(fd);
- return err;
- }
+ fd_install(fd, f);
+ return take_fd(fd);
+}
- f = dentry_open(&path, O_RDONLY, current_cred());
- path_put(&path);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else
- fd_install(fd, f);
+int open_related_ns(struct ns_common *ns,
+ struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+ struct ns_common *relative;
+
+ relative = get_ns(ns);
+ if (IS_ERR(relative))
+ return PTR_ERR(relative);
- return fd;
+ return open_namespace(relative);
}
EXPORT_SYMBOL_GPL(open_related_ns);
@@ -123,9 +132,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
+ struct pid_namespace *pid_ns;
+ struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
uid_t __user *argp;
uid_t uid;
+ int ret;
switch (ioctl) {
case NS_GET_USERNS:
@@ -143,9 +155,69 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
argp = (uid_t __user *) arg;
uid = from_kuid_munged(current_user_ns(), user_ns->owner);
return put_user(uid, argp);
+ case NS_GET_MNTNS_ID: {
+ struct mnt_namespace *mnt_ns;
+ __u64 __user *idp;
+ __u64 id;
+
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+
+ mnt_ns = container_of(ns, struct mnt_namespace, ns);
+ idp = (__u64 __user *)arg;
+ id = mnt_ns->seq;
+ return put_user(id, idp);
+ }
+ case NS_GET_PID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_PID_IN_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_IN_PIDNS: {
+ if (ns->ops->type != CLONE_NEWPID)
+ return -EINVAL;
+
+ ret = -ESRCH;
+ pid_ns = container_of(ns, struct pid_namespace, ns);
+
+ guard(rcu)();
+
+ if (ioctl == NS_GET_PID_IN_PIDNS ||
+ ioctl == NS_GET_TGID_IN_PIDNS)
+ tsk = find_task_by_vpid(arg);
+ else
+ tsk = find_task_by_pid_ns(arg, pid_ns);
+ if (!tsk)
+ break;
+
+ switch (ioctl) {
+ case NS_GET_PID_FROM_PIDNS:
+ ret = task_pid_vnr(tsk);
+ break;
+ case NS_GET_TGID_FROM_PIDNS:
+ ret = task_tgid_vnr(tsk);
+ break;
+ case NS_GET_PID_IN_PIDNS:
+ ret = task_pid_nr_ns(tsk, pid_ns);
+ break;
+ case NS_GET_TGID_IN_PIDNS:
+ ret = task_tgid_nr_ns(tsk, pid_ns);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ if (!ret)
+ ret = -ESRCH;
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
}
+
+ return ret;
}
int ns_get_name(char *buf, size_t size, struct task_struct *task,
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 27fbde2701b6..c5b688c5f984 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -259,8 +259,8 @@ enum Opt {
// clang-format off
static const struct fs_parameter_spec ntfs_fs_parameters[] = {
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("gid", Opt_gid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_gid("gid", Opt_gid),
fsparam_u32oct("umask", Opt_umask),
fsparam_u32oct("dmask", Opt_dmask),
fsparam_u32oct("fmask", Opt_fmask),
@@ -319,14 +319,10 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
switch (opt) {
case Opt_uid:
- opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(opts->fs_uid))
- return invalf(fc, "ntfs3: Invalid value for uid.");
+ opts->fs_uid = result.uid;
break;
case Opt_gid:
- opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(opts->fs_gid))
- return invalf(fc, "ntfs3: Invalid value for gid.");
+ opts->fs_gid = result.gid;
break;
case Opt_umask:
if (result.uint_32 & ~07777)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f0467d3b3c88..6be175a1ab3c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 86807086b2df..530fba34f6d3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -446,6 +446,23 @@ bail:
}
/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
+/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for metadata modifications.
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 41c9fe7e62f9..e3c3a35dc5e0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb,
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 60e208b01c8d..0511c69c9fde 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
diff --git a/fs/open.c b/fs/open.c
index 89cafb572061..22adbef7ecc2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
return error;
}
-SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
@@ -247,6 +247,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
long ret;
+ loff_t sum;
if (offset < 0 || len <= 0)
return -EINVAL;
@@ -319,8 +320,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
return -ENODEV;
- /* Check for wrap through zero too */
- if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+ /* Check for wraparound */
+ if (check_add_overflow(offset, len, &sum))
+ return -EFBIG;
+
+ if (sum > inode->i_sb->s_maxbytes)
return -EFBIG;
if (!file->f_op->fallocate)
@@ -982,12 +986,11 @@ static int do_dentry_open(struct file *f,
*/
if (f->f_mode & FMODE_WRITE) {
/*
- * Paired with smp_mb() in collapse_file() to ensure nr_thps
- * is up to date and the update to i_writecount by
- * get_write_access() is visible. Ensures subsequent insertion
- * of THPs into the page cache will fail.
+ * Depends on full fence from get_write_access() to synchronize
+ * against collapse_file() regarding i_writecount and nr_thps
+ * updates. Ensures subsequent insertion of THPs into the page
+ * cache will fail.
*/
- smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
struct address_space *mapping = inode->i_mapping;
@@ -1004,11 +1007,6 @@ static int do_dentry_open(struct file *f,
}
}
- /*
- * Once we return a file with FMODE_OPENED, __fput() will call
- * fsnotify_close(), so we need fsnotify_open() here for symmetry.
- */
- fsnotify_open(f);
return 0;
cleanup_all:
@@ -1085,8 +1083,19 @@ EXPORT_SYMBOL(file_path);
*/
int vfs_open(const struct path *path, struct file *file)
{
+ int ret;
+
file->f_path = *path;
- return do_dentry_open(file, NULL);
+ ret = do_dentry_open(file, NULL);
+ if (!ret) {
+ /*
+ * Once we return a file with FMODE_OPENED, __fput() will call
+ * fsnotify_close(), so we need fsnotify_open() here for
+ * symmetry.
+ */
+ fsnotify_open(file);
+ }
+ return ret;
}
struct file *dentry_open(const struct path *path, int flags,
@@ -1177,8 +1186,10 @@ struct file *kernel_file_open(const struct path *path, int flags,
error = do_dentry_open(f, NULL);
if (error) {
fput(f);
- f = ERR_PTR(error);
+ return ERR_PTR(error);
}
+
+ fsnotify_open(f);
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index a7b527ea50d3..26ecda0e4d19 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -471,4 +471,5 @@ static void __exit exit_openprom_fs(void)
module_init(init_openprom_fs)
module_exit(exit_openprom_fs)
+MODULE_DESCRIPTION("OpenPROM filesystem support");
MODULE_LICENSE("GPL");
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 085912268442..fdb9b65db1de 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -56,7 +56,6 @@ static int orangefs_writepage_locked(struct page *page,
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
if (ret < 0) {
- SetPageError(page);
mapping_set_error(page->mapping, ret);
} else {
ret = 0;
@@ -119,7 +118,6 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
0, &wr, NULL, NULL);
if (ret < 0) {
for (i = 0; i < ow->npages; i++) {
- SetPageError(ow->pages[i]);
mapping_set_error(ow->pages[i]->mapping, ret);
if (PagePrivate(ow->pages[i])) {
wrp = (struct orangefs_write_range *)
@@ -303,15 +301,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
flush_dcache_folio(folio);
- if (ret < 0) {
- folio_set_error(folio);
- } else {
- folio_mark_uptodate(folio);
+ if (ret > 0)
ret = 0;
- }
- /* unlock the folio after the ->read_folio() routine completes */
- folio_unlock(folio);
- return ret;
+ folio_end_read(folio, ret == 0);
+ return ret;
}
static int orangefs_write_begin(struct file *file,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index b501dc07f922..edcca4beb765 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -274,10 +274,8 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
gossip_err("orangefs error: asked for %d pages, only got %d.\n",
bufmap->page_count, ret);
- for (i = 0; i < ret; i++) {
- SetPageError(bufmap->page_array[i]);
+ for (i = 0; i < ret; i++)
unpin_user_page(bufmap->page_array[i]);
- }
return -ENOMEM;
}
diff --git a/fs/pidfs.c b/fs/pidfs.c
index dbb9d854d1c5..c9cb14181def 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -11,10 +11,16 @@
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
+#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
+#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
#include "internal.h"
+#include "mount.h"
#ifdef CONFIG_PROC_FS
/**
@@ -108,11 +114,95 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct task_struct *task __free(put_task) = NULL;
+ struct nsproxy *nsp __free(put_nsproxy) = NULL;
+ struct pid *pid = pidfd_pid(file);
+ struct ns_common *ns_common;
+
+ if (arg)
+ return -EINVAL;
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
+
+ scoped_guard(task_lock, task) {
+ nsp = task->nsproxy;
+ if (nsp)
+ get_nsproxy(nsp);
+ }
+ if (!nsp)
+ return -ESRCH; /* just pretend it didn't exist */
+
+ /*
+ * We're trying to open a file descriptor to the namespace so perform a
+ * filesystem cred ptrace check. Also, we mirror nsfs behavior.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ return -EACCES;
+
+ switch (cmd) {
+ /* Namespaces that hang of nsproxy. */
+ case PIDFD_GET_CGROUP_NAMESPACE:
+ get_cgroup_ns(nsp->cgroup_ns);
+ ns_common = to_ns_common(nsp->cgroup_ns);
+ break;
+ case PIDFD_GET_IPC_NAMESPACE:
+ get_ipc_ns(nsp->ipc_ns);
+ ns_common = to_ns_common(nsp->ipc_ns);
+ break;
+ case PIDFD_GET_MNT_NAMESPACE:
+ get_mnt_ns(nsp->mnt_ns);
+ ns_common = to_ns_common(nsp->mnt_ns);
+ break;
+ case PIDFD_GET_NET_NAMESPACE:
+ ns_common = to_ns_common(nsp->net_ns);
+ get_net_ns(ns_common);
+ break;
+ case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
+ get_pid_ns(nsp->pid_ns_for_children);
+ ns_common = to_ns_common(nsp->pid_ns_for_children);
+ break;
+ case PIDFD_GET_TIME_NAMESPACE:
+ get_time_ns(nsp->time_ns);
+ ns_common = to_ns_common(nsp->time_ns);
+ break;
+ case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
+ get_time_ns(nsp->time_ns_for_children);
+ ns_common = to_ns_common(nsp->time_ns_for_children);
+ break;
+ case PIDFD_GET_UTS_NAMESPACE:
+ get_uts_ns(nsp->uts_ns);
+ ns_common = to_ns_common(nsp->uts_ns);
+ break;
+ /* Namespaces that don't hang of nsproxy. */
+ case PIDFD_GET_USER_NAMESPACE:
+ rcu_read_lock();
+ ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
+ rcu_read_unlock();
+ break;
+ case PIDFD_GET_PID_NAMESPACE:
+ rcu_read_lock();
+ ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task)));
+ rcu_read_unlock();
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ /* open_namespace() unconditionally consumes the reference */
+ return open_namespace(ns_common);
+}
+
static const struct file_operations pidfs_file_operations = {
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
#endif
+ .unlocked_ioctl = pidfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
struct pid *pidfd_pid(const struct file *file)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a71ac5379584..a8a8576d8592 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -13,6 +13,7 @@
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
struct ctl_table_header;
struct mempolicy;
@@ -142,6 +143,38 @@ unsigned name_to_int(const struct qstr *qstr);
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+/**
+ * folio_precise_page_mapcount() - Number of mappings of this folio page.
+ * @folio: The folio.
+ * @page: The page.
+ *
+ * The number of present user page table entries that reference this page
+ * as tracked via the RMAP: either referenced directly (PTE) or as part of
+ * a larger area that covers this page (e.g., PMD).
+ *
+ * Use this function only for the calculation of existing statistics
+ * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
+ *
+ * Do not add new users.
+ *
+ * Returns: The number of mappings of this folio page. 0 for
+ * folios that are not mapped to user space or are not tracked via the RMAP
+ * (e.g., shared zeropage).
+ */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ /* Handle page_has_type() pages */
+ if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+
+ return mapcount;
+}
+
/*
* array.c
*/
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2fb64bdb64eb..b7a5c84b5819 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -37,21 +37,19 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page counts
+/* /proc/kpagecount - an array exposing page mapcounts
*
* Each entry is a u64 representing the corresponding
- * physical page count.
+ * physical page mapcount.
*/
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
- u64 pcount;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -61,18 +59,19 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
+ struct page *page;
+ u64 mapcount = 0;
+
/*
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
-
- if (!ppage)
- pcount = 0;
- else
- pcount = page_mapcount(ppage);
+ page = pfn_to_online_page(pfn);
+ if (page)
+ mapcount = folio_precise_page_mapcount(page_folio(page),
+ page);
- if (put_user(pcount, out)) {
+ if (put_user(mapcount, out)) {
ret = -EFAULT;
break;
}
@@ -148,19 +147,16 @@ u64 stable_page_flags(const struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
- /*
- * We need to check PageLRU/PageAnon
- * to make sure a given page is a thp, not a non-huge compound page.
- */
- else if (folio_test_large(folio)) {
- if ((k & (1 << PG_lru)) || is_anon)
- u |= 1 << KPF_THP;
- else if (is_huge_zero_folio(folio)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
- }
- } else if (is_zero_pfn(page_to_pfn(page)))
+ else if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio)) {
+ /* Note: we indicate any THPs here, not just PMD-sized ones */
+ u |= 1 << KPF_THP;
+ } else if (is_huge_zero_folio(folio)) {
u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ } else if (is_zero_folio(folio)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ }
/*
* Caveats on high order pages: PG_buddy and PG_slab will only be set
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b1c2c0b82116..9553e77c9d31 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -21,7 +21,7 @@
#define list_for_each_table_entry(entry, header) \
entry = header->ctl_table; \
- for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++)
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
@@ -476,12 +476,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
root->set_ownership(head, &inode->i_uid, &inode->i_gid);
- else {
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- }
return inode;
}
@@ -951,14 +949,14 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
@@ -1093,6 +1091,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
static int sysctl_check_table_array(const char *path, struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
@@ -1104,6 +1103,19 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if (table->proc_handler == proc_dou8vec_minmax) {
if (table->maxlen != sizeof(u8))
err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
}
if (table->proc_handler == proc_dobool) {
@@ -1119,6 +1131,8 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header)
struct ctl_table *entry;
int err = 0;
list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
if ((entry->proc_handler == proc_dostring) ||
(entry->proc_handler == proc_dobool) ||
(entry->proc_handler == proc_dointvec) ||
@@ -1154,18 +1168,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
struct ctl_table_header *links;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
list_for_each_table_entry(entry, head) {
- nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1173,8 +1185,8 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
link = link_table;
list_for_each_table_entry(entry, head) {
@@ -1188,7 +1200,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
}
init_header(links, dir->header.root, dir->header.set, node, link_table,
head->ctl_table_size);
- links->nreg = nr_entries;
+ links->nreg = head->ctl_table_size;
return links;
}
@@ -1300,28 +1312,23 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure without any child. This table
- * should not be free'd after registration. So it should not be
- * used on stack. It can either be a global or dynamically allocated
- * by the caller and free'd later after sysctl unregistration.
+ *
+ * @table: the top-level table structure. This table should not be free'd
+ * after registration. So it should not be used on stack. It can either
+ * be a global or dynamically allocated by the caller and free'd later
+ * after sysctl unregistration.
* @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
@@ -1329,8 +1336,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
- * sysctl_check_table() verifies this.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f8d35f993fe5..775a2e8d600c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -22,6 +22,7 @@
#include <linux/pkeys.h>
#include <linux/minmax.h>
#include <linux/overflow.h>
+#include <linux/buildid.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -239,6 +240,67 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
+static void get_vma_name(struct vm_area_struct *vma,
+ const struct path **path,
+ const char **name,
+ const char **name_fmt)
+{
+ struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
+
+ *name = NULL;
+ *path = NULL;
+ *name_fmt = NULL;
+
+ /*
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
+ */
+ if (vma->vm_file) {
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name) {
+ *name_fmt = "[anon_shmem:%s]";
+ *name = anon_name->name;
+ } else {
+ *path = file_user_path(vma->vm_file);
+ }
+ return;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ *name = vma->vm_ops->name(vma);
+ if (*name)
+ return;
+ }
+
+ *name = arch_vma_name(vma);
+ if (*name)
+ return;
+
+ if (!vma->vm_mm) {
+ *name = "[vdso]";
+ return;
+ }
+
+ if (vma_is_initial_heap(vma)) {
+ *name = "[heap]";
+ return;
+ }
+
+ if (vma_is_initial_stack(vma)) {
+ *name = "[stack]";
+ return;
+ }
+
+ if (anon_name) {
+ *name_fmt = "[anon:%s]";
+ *name = anon_name->name;
+ return;
+ }
+}
+
static void show_vma_header_prefix(struct seq_file *m,
unsigned long start, unsigned long end,
vm_flags_t flags, unsigned long long pgoff,
@@ -262,17 +324,15 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
- struct anon_vma_name *anon_name = NULL;
- struct mm_struct *mm = vma->vm_mm;
- struct file *file = vma->vm_file;
+ const struct path *path;
+ const char *name_fmt, *name;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- const char *name = NULL;
- if (file) {
+ if (vma->vm_file) {
const struct inode *inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
@@ -283,57 +343,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- if (mm)
- anon_name = anon_vma_name(vma);
- /*
- * Print the dentry name for named mappings, and a
- * special [heap] marker for the heap:
- */
- if (file) {
+ get_vma_name(vma, &path, &name, &name_fmt);
+ if (path) {
seq_pad(m, ' ');
- /*
- * If user named this anon shared memory via
- * prctl(PR_SET_VMA ..., use the provided name.
- */
- if (anon_name)
- seq_printf(m, "[anon_shmem:%s]", anon_name->name);
- else
- seq_path(m, file_user_path(file), "\n");
- goto done;
- }
-
- if (vma->vm_ops && vma->vm_ops->name) {
- name = vma->vm_ops->name(vma);
- if (name)
- goto done;
- }
-
- name = arch_vma_name(vma);
- if (!name) {
- if (!mm) {
- name = "[vdso]";
- goto done;
- }
-
- if (vma_is_initial_heap(vma)) {
- name = "[heap]";
- goto done;
- }
-
- if (vma_is_initial_stack(vma)) {
- name = "[stack]";
- goto done;
- }
-
- if (anon_name) {
- seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name->name);
- }
- }
-
-done:
- if (name) {
+ seq_path(m, path, "\n");
+ } else if (name_fmt) {
+ seq_pad(m, ' ');
+ seq_printf(m, name_fmt, name);
+ } else if (name) {
seq_pad(m, ' ');
seq_puts(m, name);
}
@@ -358,11 +376,268 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_maps_op);
}
+#define PROCMAP_QUERY_VMA_FLAGS ( \
+ PROCMAP_QUERY_VMA_READABLE | \
+ PROCMAP_QUERY_VMA_WRITABLE | \
+ PROCMAP_QUERY_VMA_EXECUTABLE | \
+ PROCMAP_QUERY_VMA_SHARED \
+)
+
+#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
+ PROCMAP_QUERY_FILE_BACKED_VMA | \
+ PROCMAP_QUERY_VMA_FLAGS \
+)
+
+static int query_vma_setup(struct mm_struct *mm)
+{
+ return mmap_read_lock_killable(mm);
+}
+
+static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ mmap_read_unlock(mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+ unsigned long addr, u32 flags)
+{
+ struct vm_area_struct *vma;
+
+next_vma:
+ vma = query_vma_find_by_addr(mm, addr);
+ if (!vma)
+ goto no_vma;
+
+ /* user requested only file-backed VMA, keep iterating */
+ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
+ goto skip_vma;
+
+ /* VMA permissions should satisfy query flags */
+ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
+ u32 perm = 0;
+
+ if (flags & PROCMAP_QUERY_VMA_READABLE)
+ perm |= VM_READ;
+ if (flags & PROCMAP_QUERY_VMA_WRITABLE)
+ perm |= VM_WRITE;
+ if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
+ perm |= VM_EXEC;
+ if (flags & PROCMAP_QUERY_VMA_SHARED)
+ perm |= VM_MAYSHARE;
+
+ if ((vma->vm_flags & perm) != perm)
+ goto skip_vma;
+ }
+
+ /* found covering VMA or user is OK with the matching next VMA */
+ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
+ return vma;
+
+skip_vma:
+ /*
+ * If the user needs closest matching VMA, keep iterating.
+ */
+ addr = vma->vm_end;
+ if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
+ goto next_vma;
+
+no_vma:
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+{
+ struct procmap_query karg;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ const char *name = NULL;
+ char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
+ __u64 usize;
+ int err;
+
+ if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
+ return -EFAULT;
+ /* argument struct can never be that large, reject abuse */
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+ /* argument struct should have at least query_flags and query_addr fields */
+ if (usize < offsetofend(struct procmap_query, query_addr))
+ return -EINVAL;
+ err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ if (err)
+ return err;
+
+ /* reject unknown flags */
+ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
+ return -EINVAL;
+ /* either both buffer address and size are set, or both should be zero */
+ if (!!karg.vma_name_size != !!karg.vma_name_addr)
+ return -EINVAL;
+ if (!!karg.build_id_size != !!karg.build_id_addr)
+ return -EINVAL;
+
+ mm = priv->mm;
+ if (!mm || !mmget_not_zero(mm))
+ return -ESRCH;
+
+ err = query_vma_setup(mm);
+ if (err) {
+ mmput(mm);
+ return err;
+ }
+
+ vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ vma = NULL;
+ goto out;
+ }
+
+ karg.vma_start = vma->vm_start;
+ karg.vma_end = vma->vm_end;
+
+ karg.vma_flags = 0;
+ if (vma->vm_flags & VM_READ)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
+ if (vma->vm_flags & VM_WRITE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
+ if (vma->vm_flags & VM_EXEC)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
+ if (vma->vm_flags & VM_MAYSHARE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
+
+ karg.vma_page_size = vma_kernel_pagesize(vma);
+
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
+ karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
+ karg.dev_major = MAJOR(inode->i_sb->s_dev);
+ karg.dev_minor = MINOR(inode->i_sb->s_dev);
+ karg.inode = inode->i_ino;
+ } else {
+ karg.vma_offset = 0;
+ karg.dev_major = 0;
+ karg.dev_minor = 0;
+ karg.inode = 0;
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.vma_name_size) {
+ size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
+ const struct path *path;
+ const char *name_fmt;
+ size_t name_sz = 0;
+
+ get_vma_name(vma, &path, &name, &name_fmt);
+
+ if (path || name_fmt || name) {
+ name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
+ if (!name_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (path) {
+ name = d_path(path, name_buf, name_buf_sz);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ goto out;
+ }
+ name_sz = name_buf + name_buf_sz - name;
+ } else if (name || name_fmt) {
+ name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
+ name = name_buf;
+ }
+ if (name_sz > name_buf_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.vma_name_size = name_sz;
+ }
+
+ /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
+ query_vma_teardown(mm, vma);
+ mmput(mm);
+
+ if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
+ name, karg.vma_name_size)) {
+ kfree(name_buf);
+ return -EFAULT;
+ }
+ kfree(name_buf);
+
+ if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
+ build_id_buf, karg.build_id_size))
+ return -EFAULT;
+
+ if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
+ return -EFAULT;
+
+ return 0;
+
+out:
+ query_vma_teardown(mm, vma);
+ mmput(mm);
+ kfree(name_buf);
+ return err;
+}
+
+static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ switch (cmd) {
+ case PROCMAP_QUERY:
+ return do_procmap_query(priv, (void __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = proc_map_release,
+ .unlocked_ioctl = procfs_procmap_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -442,7 +717,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
- bool migration)
+ bool present)
{
struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
@@ -471,24 +746,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * refcount == 1 guarantees the page is mapped exactly once.
- * If any subpage of the compound page mapped with PTE it would elevate
- * the refcount.
+ * refcount == 1 for present entries guarantees that the folio is mapped
+ * exactly once. For large folios this implies that exactly one
+ * PTE/PMD/... maps (a part of) this folio.
+ *
+ * Treat all non-present entries (where relying on the mapcount and
+ * refcount doesn't make sense) as "maybe shared, but not sure how
+ * often". We treat device private entries as being fake-present.
*
- * The page_mapcount() is called to get a snapshot of the mapcount.
- * Without holding the page lock this snapshot can be slightly wrong as
- * we cannot always read the mapcount atomically. It is not safe to
- * call page_mapcount() even with PTL held if the page is not mapped,
- * especially for migration entries. Treat regular migration entries
- * as mapcount == 1.
+ * Note that it would not be safe to read the mapcount especially for
+ * pages referenced by migration entries, even with the PTL held.
*/
- if ((folio_ref_count(folio) == 1) || migration) {
+ if (folio_ref_count(folio) == 1 || !present) {
smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
- dirty, locked, true);
+ dirty, locked, present);
return;
}
+ /*
+ * We obtain a snapshot of the mapcount. Without holding the folio lock
+ * this snapshot can be slightly wrong as we cannot always read the
+ * mapcount atomically.
+ */
for (i = 0; i < nr; i++, page++) {
- int mapcount = page_mapcount(page);
+ int mapcount = folio_precise_page_mapcount(folio, page);
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
if (mapcount >= 2)
pss /= mapcount;
@@ -531,13 +811,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false, young = false, dirty = false;
+ bool present = false, young = false, dirty = false;
pte_t ptent = ptep_get(pte);
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
young = pte_young(ptent);
dirty = pte_dirty(ptent);
+ present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
@@ -555,8 +836,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
} else if (is_pfn_swap_entry(swpent)) {
- if (is_migration_entry(swpent))
- migration = true;
+ if (is_device_private_entry(swpent))
+ present = true;
page = pfn_swap_entry_to_page(swpent);
}
} else {
@@ -567,7 +848,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, young, dirty, locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, present);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -578,18 +859,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool present = false;
struct folio *folio;
- bool migration = false;
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
+ present = true;
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry)) {
- migration = true;
+ if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
- }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -604,7 +884,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
mss->file_thp += HPAGE_PMD_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
- locked, migration);
+ locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -707,6 +987,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
@@ -730,19 +1013,23 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- pte_t ptent = huge_ptep_get(pte);
+ pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
struct folio *folio = NULL;
+ bool present = false;
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
+ present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
if (is_pfn_swap_entry(swpent))
folio = pfn_swap_entry_folio(swpent);
}
+
if (folio) {
- if (folio_likely_mapped_shared(folio) ||
+ /* We treat non-present entries as "maybe shared". */
+ if (!present || folio_likely_mapped_shared(folio) ||
hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
@@ -1088,7 +1375,7 @@ struct clear_refs_private {
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- struct page *page;
+ struct folio *folio;
if (!pte_write(pte))
return false;
@@ -1096,10 +1383,10 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
return false;
if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
return false;
- page = vm_normal_page(vma, addr, pte);
- if (!page)
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio)
return false;
- return page_maybe_dma_pinned(page);
+ return folio_maybe_dma_pinned(folio);
}
static inline void clear_soft_dirty(struct vm_area_struct *vma,
@@ -1415,7 +1702,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
- bool migration = false;
+ struct folio *folio;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1447,17 +1734,20 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
- migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
if (pte_marker_entry_uffd_wp(entry))
flags |= PM_UFFD_WP;
}
- if (page && !PageAnon(page))
- flags |= PM_FILE;
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ if ((flags & PM_PRESENT) &&
+ folio_precise_page_mapcount(folio, page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ }
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1473,13 +1763,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool migration = false;
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
+ unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
struct page *page = NULL;
+ struct folio *folio = NULL;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1493,8 +1784,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_uffd_wp(pmd))
flags |= PM_UFFD_WP;
if (pm->show_pfn)
- frame = pmd_pfn(pmd) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = pmd_pfn(pmd) + idx;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
@@ -1503,11 +1793,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pm->show_pfn) {
if (is_pfn_swap_entry(entry))
- offset = swp_offset_pfn(entry);
+ offset = swp_offset_pfn(entry) + idx;
else
- offset = swp_offset(entry);
- offset = offset +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ offset = swp_offset(entry) + idx;
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
@@ -1517,17 +1805,25 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, idx++) {
+ unsigned long cur_flags = flags;
+ pagemap_entry_t pme;
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
+ if (folio && (flags & PM_PRESENT) &&
+ folio_precise_page_mapcount(folio, page + idx) == 1)
+ cur_flags |= PM_MMAP_EXCLUSIVE;
+ pme = make_pme(frame, cur_flags);
err = add_to_pagemap(&pme, pm);
if (err)
break;
@@ -1582,7 +1878,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
struct folio *folio = page_folio(pte_page(pte));
@@ -2271,7 +2567,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
/* Go the short route when not write-protecting pages. */
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, start, ptep);
categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
if (!pagemap_scan_is_interesting_page(categories, p))
@@ -2283,7 +2579,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
i_mmap_lock_write(vma->vm_file->f_mapping);
ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, start, ptep);
categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
if (!pagemap_scan_is_interesting_page(categories, p))
@@ -2563,7 +2859,7 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
struct folio *folio = page_folio(page);
- int count = page_mapcount(page);
+ int count = folio_precise_page_mapcount(folio, page);
md->pages += nr_pages;
if (pte_dirty || folio_test_dirty(folio))
@@ -2679,7 +2975,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(pte);
+ pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
struct numa_maps *md;
struct page *page;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0a808951b7d3..e133b507ddf3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -61,7 +61,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
return security_sb_show_options(m, sb);
}
-static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
+static void show_vfsmnt_opts(struct seq_file *m, struct vfsmount *mnt)
{
static const struct proc_fs_opts mnt_opts[] = {
{ MNT_NOSUID, ",nosuid" },
@@ -124,7 +124,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
err = show_sb_opts(m, sb);
if (err)
goto out;
- show_mnt_opts(m, mnt);
+ show_vfsmnt_opts(m, mnt);
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt_path.dentry);
seq_puts(m, " 0 0\n");
@@ -153,7 +153,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
goto out;
seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
- show_mnt_opts(m, mnt);
+ show_vfsmnt_opts(m, mnt);
/* Tagged fields ("foo:X" or "bar") */
if (IS_MNT_SHARED(r))
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index de8cf5d75f34..65b2473e22ff 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -241,7 +241,7 @@ err:
/* get information of pstore/blk */
int pstore_blk_get_config(struct pstore_blk_config *info)
{
- strncpy(info->device, blkdev, 80);
+ strscpy(info->device, blkdev);
info->max_reason = max_reason;
info->kmsg_size = check_size(kmsg_size, 4096);
info->pmsg_size = check_size(pmsg_size, 4096);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 03425928d2fb..3497ede88aa0 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -761,4 +761,5 @@ static void __exit pstore_exit(void)
module_exit(pstore_exit)
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_DESCRIPTION("Persistent Storage - platform driver interface");
MODULE_LICENSE("GPL");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index b1a455f42e93..4311fcbc84f2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -50,6 +50,10 @@ module_param_hw(mem_address, ullong, other, 0400);
MODULE_PARM_DESC(mem_address,
"start of reserved RAM used to store oops/panic logs");
+static char *mem_name;
+module_param_named(mem_name, mem_name, charp, 0400);
+MODULE_PARM_DESC(mem_name, "name of kernel param that holds addr");
+
static ulong mem_size;
module_param(mem_size, ulong, 0400);
MODULE_PARM_DESC(mem_size,
@@ -914,6 +918,16 @@ static void __init ramoops_register_dummy(void)
{
struct ramoops_platform_data pdata;
+ if (mem_name) {
+ phys_addr_t start;
+ phys_addr_t size;
+
+ if (reserve_mem_find_by_name(mem_name, &start, &size)) {
+ mem_address = start;
+ mem_size = size;
+ }
+ }
+
/*
* Prepare a dummy platform data structure to carry the module
* parameters. If mem_size isn't set, then there are no module
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d79841e94428..e399e2dd3a12 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -430,5 +430,6 @@ static void __exit exit_qnx4_fs(void)
module_init(init_qnx4_fs)
module_exit(exit_qnx4_fs)
+MODULE_DESCRIPTION("QNX4 file system");
MODULE_LICENSE("GPL");
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index d62fbef838b6..4f1735b882b1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -694,4 +694,5 @@ static void __exit exit_qnx6_fs(void)
module_init(init_qnx6_fs)
module_exit(exit_qnx6_fs)
+MODULE_DESCRIPTION("QNX6 file system");
MODULE_LICENSE("GPL");
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 627eb2f72ef3..a2b256dac36e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2246,9 +2246,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
int cnt;
struct quota_info *dqopt = sb_dqopt(sb);
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
+ rwsem_assert_held_write(&sb->s_umount);
/* Cannot turn off usage accounting without turning off limits, or
* suspend quotas and simultaneously turn quotas off. */
@@ -2510,9 +2508,7 @@ int dquot_resume(struct super_block *sb, int type)
int ret = 0, cnt;
unsigned int flags;
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
+ rwsem_assert_held_write(&sb->s_umount);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
diff --git a/fs/read_write.c b/fs/read_write.c
index ef6339391351..90e283b31ca1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
- ret = kiocb_set_rw_flags(&kiocb, flags);
+ ret = kiocb_set_rw_flags(&kiocb, flags, type);
if (ret)
return ret;
kiocb.ki_pos = (ppos ? *ppos : 0);
@@ -1736,3 +1736,19 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
+
+bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+{
+ size_t len = iov_iter_count(iter);
+
+ if (!iter_is_ubuf(iter))
+ return false;
+
+ if (!is_power_of_2(len))
+ return false;
+
+ if (!IS_ALIGNED(pos, len))
+ return false;
+
+ return true;
+}
diff --git a/fs/readdir.c b/fs/readdir.c
index 278bc0254732..d6c82421902a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -22,8 +22,6 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
-
/*
* Some filesystems were never converted to '->iterate_shared()'
* and their directory iterators want the inode lock held for
@@ -72,7 +70,7 @@ int wrap_directory_iterator(struct file *file,
EXPORT_SYMBOL(wrap_directory_iterator);
/*
- * Note the "unsafe_put_user() semantics: we goto a
+ * Note the "unsafe_put_user()" semantics: we goto a
* label for errors.
*/
#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c1daedc50f4c..9b43a81a6488 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2699,7 +2699,6 @@ fail:
}
bh = bh->b_this_page;
} while (bh != head);
- folio_set_error(folio);
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
folio_unlock(folio);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 2cbb92462074..68758b6fed94 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -101,19 +101,15 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
*/
static int romfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
loff_t offset, size;
unsigned long fillsize, pos;
void *buf;
int ret;
- buf = kmap(page);
- if (!buf)
- return -ENOMEM;
+ buf = kmap_local_folio(folio, 0);
- /* 32 bit warning -- but not for us :) */
- offset = page_offset(page);
+ offset = folio_pos(folio);
size = i_size_read(inode);
fillsize = 0;
ret = 0;
@@ -125,20 +121,14 @@ static int romfs_read_folio(struct file *file, struct folio *folio)
ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
if (ret < 0) {
- SetPageError(page);
fillsize = 0;
ret = -EIO;
}
}
- if (fillsize < PAGE_SIZE)
- memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
- if (ret == 0)
- SetPageUptodate(page);
-
- flush_dcache_page(page);
- kunmap(page);
- unlock_page(page);
+ buf = folio_zero_tail(folio, fillsize, buf);
+ kunmap_local(buf);
+ folio_end_read(folio, ret == 0);
return ret;
}
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 6397fdefd876..c92937bed133 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1359,7 +1359,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
target_tcon = tlink_tcon(smb_file_target->tlink);
if (src_tcon->ses != target_tcon->ses) {
- cifs_dbg(VFS, "source and target of copy not on same server\n");
+ cifs_dbg(FYI, "source and target of copy not on same server\n");
goto out;
}
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 557b68e99d0a..8e86fec7dcd2 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -290,7 +290,7 @@ struct smb_version_operations {
int (*check_receive)(struct mid_q_entry *, struct TCP_Server_Info *,
bool);
void (*add_credits)(struct TCP_Server_Info *server,
- const struct cifs_credits *credits,
+ struct cifs_credits *credits,
const int optype);
void (*set_credits)(struct TCP_Server_Info *, const int);
int * (*get_credits_field)(struct TCP_Server_Info *, const int);
@@ -550,8 +550,8 @@ struct smb_version_operations {
size_t *, struct cifs_credits *);
/* adjust previously taken mtu credits to request size */
int (*adjust_credits)(struct TCP_Server_Info *server,
- struct cifs_credits *credits,
- const unsigned int payload_size);
+ struct cifs_io_subrequest *subreq,
+ unsigned int /*enum smb3_rw_credits_trace*/ trace);
/* check if we need to issue closedir */
bool (*dir_needs_close)(struct cifsFileInfo *);
long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
@@ -848,6 +848,9 @@ static inline void cifs_server_unlock(struct TCP_Server_Info *server)
struct cifs_credits {
unsigned int value;
unsigned int instance;
+ unsigned int in_flight_check;
+ unsigned int rreq_debug_id;
+ unsigned int rreq_debug_index;
};
static inline unsigned int
@@ -873,7 +876,7 @@ has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
}
static inline void
-add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits,
+add_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
const int optype)
{
server->ops->add_credits(server, credits, optype);
@@ -897,11 +900,11 @@ set_credits(struct TCP_Server_Info *server, const int val)
}
static inline int
-adjust_credits(struct TCP_Server_Info *server, struct cifs_credits *credits,
- const unsigned int payload_size)
+adjust_credits(struct TCP_Server_Info *server, struct cifs_io_subrequest *subreq,
+ unsigned int /* enum smb3_rw_credits_trace */ trace)
{
return server->ops->adjust_credits ?
- server->ops->adjust_credits(server, credits, payload_size) : 0;
+ server->ops->adjust_credits(server, subreq, trace) : 0;
}
static inline __le64
@@ -1918,8 +1921,8 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
-#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL)
+#define CIFSSEC_MAX (CIFSSEC_MAY_SIGN | CIFSSEC_MUST_KRB5 | CIFSSEC_MAY_SEAL)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
*****************************************************************
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index f1f2573bb18d..b2405dd4d4d4 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -80,6 +80,16 @@ retry:
return netfs_prepare_write_failed(subreq);
}
+ wdata->credits.rreq_debug_id = subreq->rreq->debug_id;
+ wdata->credits.rreq_debug_index = subreq->debug_index;
+ wdata->credits.in_flight_check = 1;
+ trace_smb3_rw_credits(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ wdata->credits.value,
+ server->credits, server->in_flight,
+ wdata->credits.value,
+ cifs_trace_rw_credits_write_prepare);
+
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
@@ -101,7 +111,7 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq)
goto fail;
}
- rc = adjust_credits(wdata->server, &wdata->credits, wdata->subreq.len);
+ rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust);
if (rc)
goto fail;
@@ -123,6 +133,11 @@ fail:
goto out;
}
+static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq)
+{
+ cifs_invalidate_cache(wreq->inode, 0);
+}
+
/*
* Split the read up according to how many credits we can get for each piece.
* It's okay to sleep here if we need to wait for more credit to become
@@ -158,7 +173,18 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
return false;
}
+ rdata->credits.in_flight_check = 1;
+ rdata->credits.rreq_debug_id = rreq->debug_id;
+ rdata->credits.rreq_debug_index = subreq->debug_index;
+
+ trace_smb3_rw_credits(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->credits.value,
+ server->credits, server->in_flight, 0,
+ cifs_trace_rw_credits_read_submit);
+
subreq->len = min_t(size_t, subreq->len, rsize);
+
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->smbd_conn)
subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
@@ -246,35 +272,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
}
/*
- * Expand the size of a readahead to the size of the rsize, if at least as
- * large as a page, allowing for the possibility that rsize is not pow-2
- * aligned.
- */
-static void cifs_expand_readahead(struct netfs_io_request *rreq)
-{
- unsigned int rsize = rreq->rsize;
- loff_t misalignment, i_size = i_size_read(rreq->inode);
-
- if (rsize < PAGE_SIZE)
- return;
-
- if (rsize < INT_MAX)
- rsize = roundup_pow_of_two(rsize);
- else
- rsize = ((unsigned int)INT_MAX + 1) / 2;
-
- misalignment = rreq->start & (rsize - 1);
- if (misalignment) {
- rreq->start -= misalignment;
- rreq->len += misalignment;
- }
-
- rreq->len = round_up(rreq->len, rsize);
- if (rreq->start < i_size && rreq->len > i_size - rreq->start)
- rreq->len = i_size - rreq->start;
-}
-
-/*
* Completion of a request operation.
*/
static void cifs_rreq_done(struct netfs_io_request *rreq)
@@ -318,6 +315,15 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
#endif
}
+ if (rdata->credits.value != 0)
+ trace_smb3_rw_credits(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->credits.value,
+ rdata->server ? rdata->server->credits : 0,
+ rdata->server ? rdata->server->in_flight : 0,
+ -rdata->credits.value,
+ cifs_trace_rw_credits_free_subreq);
+
add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
if (rdata->have_xid)
free_xid(rdata->xid);
@@ -329,7 +335,6 @@ const struct netfs_request_ops cifs_req_ops = {
.init_request = cifs_init_request,
.free_request = cifs_free_request,
.free_subrequest = cifs_free_subrequest,
- .expand_readahead = cifs_expand_readahead,
.clamp_length = cifs_clamp_length,
.issue_read = cifs_req_issue_read,
.done = cifs_rreq_done,
@@ -337,6 +342,7 @@ const struct netfs_request_ops cifs_req_ops = {
.begin_writeback = cifs_begin_writeback,
.prepare_write = cifs_prepare_write,
.issue_write = cifs_issue_write,
+ .invalidate_cache = cifs_netfs_invalidate_cache,
};
/*
@@ -2388,13 +2394,18 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t
bool was_async)
{
struct netfs_io_request *wreq = wdata->rreq;
- loff_t new_server_eof;
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ loff_t wrend;
if (result > 0) {
- new_server_eof = wdata->subreq.start + wdata->subreq.transferred + result;
+ wrend = wdata->subreq.start + wdata->subreq.transferred + result;
- if (new_server_eof > netfs_inode(wreq->inode)->remote_i_size)
- netfs_resize_file(netfs_inode(wreq->inode), new_server_eof, true);
+ if (wrend > ictx->zero_point &&
+ (wdata->rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ wdata->rreq->origin == NETFS_DIO_WRITE))
+ ictx->zero_point = wrend;
+ if (wrend > ictx->remote_i_size)
+ netfs_resize_file(ictx, wrend, true);
}
netfs_write_subrequest_terminated(&wdata->subreq, result, was_async);
@@ -2907,6 +2918,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
rc = netfs_start_io_direct(inode);
if (rc < 0)
goto out;
+ rc = -EACCES;
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(
cfile, iocb->ki_pos, iov_iter_count(to),
@@ -2919,6 +2931,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
rc = netfs_start_io_read(inode);
if (rc < 0)
goto out;
+ rc = -EACCES;
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(
cfile, iocb->ki_pos, iov_iter_count(to),
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 3bbac925d076..bc926ab2555b 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -128,12 +128,14 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("compress", Opt_compress),
fsparam_flag("witness", Opt_witness),
+ /* Mount options which take uid or gid */
+ fsparam_uid("backupuid", Opt_backupuid),
+ fsparam_gid("backupgid", Opt_backupgid),
+ fsparam_uid("uid", Opt_uid),
+ fsparam_uid("cruid", Opt_cruid),
+ fsparam_gid("gid", Opt_gid),
+
/* Mount options which take numeric value */
- fsparam_u32("backupuid", Opt_backupuid),
- fsparam_u32("backupgid", Opt_backupgid),
- fsparam_u32("uid", Opt_uid),
- fsparam_u32("cruid", Opt_cruid),
- fsparam_u32("gid", Opt_gid),
fsparam_u32("file_mode", Opt_file_mode),
fsparam_u32("dirmode", Opt_dirmode),
fsparam_u32("dir_mode", Opt_dirmode),
@@ -951,8 +953,6 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int i, opt;
bool is_smb3 = !strcmp(fc->fs_type->name, "smb3");
bool skip_parsing = false;
- kuid_t uid;
- kgid_t gid;
cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key);
@@ -1083,38 +1083,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
break;
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->linux_uid = uid;
+ ctx->linux_uid = result.uid;
ctx->uid_specified = true;
break;
case Opt_cruid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->cred_uid = uid;
+ ctx->cred_uid = result.uid;
ctx->cruid_specified = true;
break;
case Opt_backupuid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- goto cifs_parse_mount_err;
- ctx->backupuid = uid;
+ ctx->backupuid = result.uid;
ctx->backupuid_specified = true;
break;
case Opt_backupgid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- goto cifs_parse_mount_err;
- ctx->backupgid = gid;
+ ctx->backupgid = result.gid;
ctx->backupgid_specified = true;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- goto cifs_parse_mount_err;
- ctx->linux_gid = gid;
+ ctx->linux_gid = result.gid;
ctx->gid_specified = true;
break;
case Opt_port:
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 212ec6f66ec6..e1f2feb56f45 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -108,7 +108,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
static void
cifs_add_credits(struct TCP_Server_Info *server,
- const struct cifs_credits *credits, const int optype)
+ struct cifs_credits *credits, const int optype)
{
spin_lock(&server->req_lock);
server->credits += credits->value;
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index c8e536540895..7fe59235f090 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -66,7 +66,7 @@ change_conf(struct TCP_Server_Info *server)
static void
smb2_add_credits(struct TCP_Server_Info *server,
- const struct cifs_credits *credits, const int optype)
+ struct cifs_credits *credits, const int optype)
{
int *val, rc = -1;
int scredits, in_flight;
@@ -94,7 +94,21 @@ smb2_add_credits(struct TCP_Server_Info *server,
server->conn_id, server->hostname, *val,
add, server->in_flight);
}
- WARN_ON_ONCE(server->in_flight == 0);
+ if (credits->in_flight_check > 1) {
+ pr_warn_once("rreq R=%08x[%x] Credits not in flight\n",
+ credits->rreq_debug_id, credits->rreq_debug_index);
+ } else {
+ credits->in_flight_check = 2;
+ }
+ if (WARN_ON_ONCE(server->in_flight == 0)) {
+ pr_warn_once("rreq R=%08x[%x] Zero in_flight\n",
+ credits->rreq_debug_id, credits->rreq_debug_index);
+ trace_smb3_rw_credits(credits->rreq_debug_id,
+ credits->rreq_debug_index,
+ credits->value,
+ server->credits, server->in_flight, 0,
+ cifs_trace_rw_credits_zero_in_flight);
+ }
server->in_flight--;
if (server->in_flight == 0 &&
((optype & CIFS_OP_MASK) != CIFS_NEG_OP) &&
@@ -283,16 +297,23 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
static int
smb2_adjust_credits(struct TCP_Server_Info *server,
- struct cifs_credits *credits,
- const unsigned int payload_size)
+ struct cifs_io_subrequest *subreq,
+ unsigned int /*enum smb3_rw_credits_trace*/ trace)
{
- int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE);
+ struct cifs_credits *credits = &subreq->credits;
+ int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE);
int scredits, in_flight;
if (!credits->value || credits->value == new_val)
return 0;
if (credits->value < new_val) {
+ trace_smb3_rw_credits(subreq->rreq->debug_id,
+ subreq->subreq.debug_index,
+ credits->value,
+ server->credits, server->in_flight,
+ new_val - credits->value,
+ cifs_trace_rw_credits_no_adjust_up);
trace_smb3_too_many_credits(server->CurrentMid,
server->conn_id, server->hostname, 0, credits->value - new_val, 0);
cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
@@ -308,6 +329,12 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
+ trace_smb3_rw_credits(subreq->rreq->debug_id,
+ subreq->subreq.debug_index,
+ credits->value,
+ server->credits, server->in_flight,
+ new_val - credits->value,
+ cifs_trace_rw_credits_old_session);
trace_smb3_reconnect_detected(server->CurrentMid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
@@ -316,6 +343,11 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
return -EAGAIN;
}
+ trace_smb3_rw_credits(subreq->rreq->debug_id,
+ subreq->subreq.debug_index,
+ credits->value,
+ server->credits, server->in_flight,
+ new_val - credits->value, trace);
server->credits += credits->value - new_val;
scredits = server->credits;
in_flight = server->in_flight;
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 2ae2dbb6202b..9fc5b11c0b6c 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4502,8 +4502,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = rdata->server;
struct smb2_hdr *shdr =
(struct smb2_hdr *)rdata->iov[0].iov_base;
- struct cifs_credits credits = { .value = 0, .instance = 0 };
+ struct cifs_credits credits = {
+ .value = 0,
+ .instance = 0,
+ .rreq_debug_id = rdata->rreq->debug_id,
+ .rreq_debug_index = rdata->subreq.debug_index,
+ };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1 };
+ unsigned int rreq_debug_id = rdata->rreq->debug_id;
+ unsigned int subreq_debug_index = rdata->subreq.debug_index;
if (rdata->got_bytes) {
rqst.rq_iter = rdata->subreq.io_iter;
@@ -4587,10 +4594,16 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->subreq.start < rdata->subreq.rreq->i_size)
rdata->result = 0;
}
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
+ server->credits, server->in_flight,
+ 0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0;
INIT_WORK(&rdata->subreq.work, smb2_readv_worker);
queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid);
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
+ server->credits, server->in_flight,
+ credits.value, cifs_trace_rw_credits_read_response_add);
add_credits(server, &credits, 0);
}
@@ -4647,7 +4660,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata)
min_t(int, server->max_credits -
server->credits, credit_request));
- rc = adjust_credits(server, &rdata->credits, rdata->subreq.len);
+ rc = adjust_credits(server, rdata, cifs_trace_rw_credits_call_readv_adjust);
if (rc)
goto async_readv_out;
@@ -4766,7 +4779,14 @@ smb2_writev_callback(struct mid_q_entry *mid)
struct cifs_tcon *tcon = tlink_tcon(wdata->req->cfile->tlink);
struct TCP_Server_Info *server = wdata->server;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
- struct cifs_credits credits = { .value = 0, .instance = 0 };
+ struct cifs_credits credits = {
+ .value = 0,
+ .instance = 0,
+ .rreq_debug_id = wdata->rreq->debug_id,
+ .rreq_debug_index = wdata->subreq.debug_index,
+ };
+ unsigned int rreq_debug_id = wdata->rreq->debug_id;
+ unsigned int subreq_debug_index = wdata->subreq.debug_index;
ssize_t result = 0;
size_t written;
@@ -4837,9 +4857,15 @@ smb2_writev_callback(struct mid_q_entry *mid)
tcon->tid, tcon->ses->Suid,
wdata->subreq.start, wdata->subreq.len);
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, wdata->credits.value,
+ server->credits, server->in_flight,
+ 0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
cifs_write_subrequest_terminated(wdata, result ?: written, true);
release_mid(mid);
+ trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
+ server->credits, server->in_flight,
+ credits.value, cifs_trace_rw_credits_write_response_add);
add_credits(server, &credits, 0);
}
@@ -4859,9 +4885,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server || test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags))
- server = wdata->server = cifs_pick_channel(tcon->ses);
-
/*
* in future we may get cifs_io_parms passed in from the caller,
* but for now we construct it here...
@@ -4972,7 +4995,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
min_t(int, server->max_credits -
server->credits, credit_request));
- rc = adjust_credits(server, &wdata->credits, io_parms->length);
+ rc = adjust_credits(server, wdata, cifs_trace_rw_credits_call_writev_adjust);
if (rc)
goto async_writev_out;
@@ -4997,6 +5020,12 @@ async_writev_out:
cifs_small_buf_release(req);
out:
if (rc) {
+ trace_smb3_rw_credits(wdata->rreq->debug_id,
+ wdata->subreq.debug_index,
+ wdata->credits.value,
+ server->credits, server->in_flight,
+ -(int)wdata->credits.value,
+ cifs_trace_rw_credits_write_response_clear);
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
cifs_write_subrequest_terminated(wdata, rc, true);
}
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 36d47ce59631..36d5295c2a6f 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -20,6 +20,22 @@
/*
* Specify enums for tracing information.
*/
+#define smb3_rw_credits_traces \
+ EM(cifs_trace_rw_credits_call_readv_adjust, "rd-call-adj") \
+ EM(cifs_trace_rw_credits_call_writev_adjust, "wr-call-adj") \
+ EM(cifs_trace_rw_credits_free_subreq, "free-subreq") \
+ EM(cifs_trace_rw_credits_issue_read_adjust, "rd-issu-adj") \
+ EM(cifs_trace_rw_credits_issue_write_adjust, "wr-issu-adj") \
+ EM(cifs_trace_rw_credits_no_adjust_up, "no-adj-up ") \
+ EM(cifs_trace_rw_credits_old_session, "old-session") \
+ EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \
+ EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \
+ EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \
+ EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \
+ EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \
+ EM(cifs_trace_rw_credits_write_response_clear, "wr-resp-clr") \
+ E_(cifs_trace_rw_credits_zero_in_flight, "ZERO-IN-FLT")
+
#define smb3_tcon_ref_traces \
EM(netfs_trace_tcon_ref_dec_dfs_refer, "DEC DfsRef") \
EM(netfs_trace_tcon_ref_free, "FRE ") \
@@ -59,7 +75,8 @@
#define EM(a, b) a,
#define E_(a, b) a
-enum smb3_tcon_ref_trace { smb3_tcon_ref_traces } __mode(byte);
+enum smb3_rw_credits_trace { smb3_rw_credits_traces } __mode(byte);
+enum smb3_tcon_ref_trace { smb3_tcon_ref_traces } __mode(byte);
#undef EM
#undef E_
@@ -71,6 +88,7 @@ enum smb3_tcon_ref_trace { smb3_tcon_ref_traces } __mode(byte);
#define EM(a, b) TRACE_DEFINE_ENUM(a);
#define E_(a, b) TRACE_DEFINE_ENUM(a);
+smb3_rw_credits_traces;
smb3_tcon_ref_traces;
#undef EM
@@ -1316,6 +1334,41 @@ TRACE_EVENT(smb3_tcon_ref,
__entry->ref)
);
+TRACE_EVENT(smb3_rw_credits,
+ TP_PROTO(unsigned int rreq_debug_id,
+ unsigned int subreq_debug_index,
+ unsigned int subreq_credits,
+ unsigned int server_credits,
+ int server_in_flight,
+ int credit_change,
+ enum smb3_rw_credits_trace trace),
+ TP_ARGS(rreq_debug_id, subreq_debug_index, subreq_credits,
+ server_credits, server_in_flight, credit_change, trace),
+ TP_STRUCT__entry(
+ __field(unsigned int, rreq_debug_id)
+ __field(unsigned int, subreq_debug_index)
+ __field(unsigned int, subreq_credits)
+ __field(unsigned int, server_credits)
+ __field(int, in_flight)
+ __field(int, credit_change)
+ __field(enum smb3_rw_credits_trace, trace)
+ ),
+ TP_fast_assign(
+ __entry->rreq_debug_id = rreq_debug_id;
+ __entry->subreq_debug_index = subreq_debug_index;
+ __entry->subreq_credits = subreq_credits;
+ __entry->server_credits = server_credits;
+ __entry->in_flight = server_in_flight;
+ __entry->credit_change = credit_change;
+ __entry->trace = trace;
+ ),
+ TP_printk("R=%08x[%x] %s cred=%u chg=%d pool=%u ifl=%d",
+ __entry->rreq_debug_id, __entry->subreq_debug_index,
+ __print_symbolic(__entry->trace, smb3_rw_credits_traces),
+ __entry->subreq_credits, __entry->credit_change,
+ __entry->server_credits, __entry->in_flight)
+ );
+
#undef EM
#undef E_
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 012b9bd06995..adfe0d058701 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -988,10 +988,10 @@ static void
cifs_compound_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->server;
- struct cifs_credits credits;
-
- credits.value = server->ops->get_credits(mid);
- credits.instance = server->reconnect_instance;
+ struct cifs_credits credits = {
+ .value = server->ops->get_credits(mid),
+ .instance = server->reconnect_instance,
+ };
add_credits(server, &credits, mid->optype);
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 8d10be1fe18a..c3ee42188d25 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -917,6 +917,40 @@ struct smb2_query_directory_rsp {
__u8 Buffer[];
} __packed;
+/* DeviceType Flags */
+#define FILE_DEVICE_CD_ROM 0x00000002
+#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003
+#define FILE_DEVICE_DFS 0x00000006
+#define FILE_DEVICE_DISK 0x00000007
+#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008
+#define FILE_DEVICE_FILE_SYSTEM 0x00000009
+#define FILE_DEVICE_NAMED_PIPE 0x00000011
+#define FILE_DEVICE_NETWORK 0x00000012
+#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014
+#define FILE_DEVICE_NULL 0x00000015
+#define FILE_DEVICE_PARALLEL_PORT 0x00000016
+#define FILE_DEVICE_PRINTER 0x00000018
+#define FILE_DEVICE_SERIAL_PORT 0x0000001b
+#define FILE_DEVICE_STREAMS 0x0000001e
+#define FILE_DEVICE_TAPE 0x0000001f
+#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020
+#define FILE_DEVICE_VIRTUAL_DISK 0x00000024
+#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028
+
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA 0x00000001
+#define FILE_READ_ONLY_DEVICE 0x00000002
+#define FILE_FLOPPY_DISKETTE 0x00000004
+#define FILE_WRITE_ONCE_MEDIA 0x00000008
+#define FILE_REMOTE_DEVICE 0x00000010
+#define FILE_DEVICE_IS_MOUNTED 0x00000020
+#define FILE_VIRTUAL_VOLUME 0x00000040
+#define FILE_DEVICE_SECURE_OPEN 0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000
+#define FILE_PORTABLE_DEVICE 0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
/*
* Maximum number of iovs we need for a set-info request.
* The largest one is rename/hardlink
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index e7e07891781b..840c71c66b30 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -2051,15 +2051,22 @@ out_err1:
* @access: file access flags
* @disposition: file disposition flags
* @may_flags: set with MAY_ flags
+ * @is_dir: is creating open flags for directory
*
* Return: file open flags
*/
static int smb2_create_open_flags(bool file_present, __le32 access,
__le32 disposition,
- int *may_flags)
+ int *may_flags,
+ bool is_dir)
{
int oflags = O_NONBLOCK | O_LARGEFILE;
+ if (is_dir) {
+ access &= ~FILE_WRITE_DESIRE_ACCESS_LE;
+ ksmbd_debug(SMB, "Discard write access to a directory\n");
+ }
+
if (access & FILE_READ_DESIRED_ACCESS_LE &&
access & FILE_WRITE_DESIRE_ACCESS_LE) {
oflags |= O_RDWR;
@@ -3167,7 +3174,9 @@ int smb2_open(struct ksmbd_work *work)
open_flags = smb2_create_open_flags(file_present, daccess,
req->CreateDisposition,
- &may_flags);
+ &may_flags,
+ req->CreateOptions & FILE_DIRECTORY_FILE_LE ||
+ (file_present && S_ISDIR(d_inode(path.dentry)->i_mode)));
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
if (open_flags & (O_CREAT | O_TRUNC)) {
@@ -5314,8 +5323,13 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info = (struct filesystem_device_info *)rsp->Buffer;
- info->DeviceType = cpu_to_le32(stfs.f_type);
- info->DeviceCharacteristics = cpu_to_le32(0x00000020);
+ info->DeviceType = cpu_to_le32(FILE_DEVICE_DISK);
+ info->DeviceCharacteristics =
+ cpu_to_le32(FILE_DEVICE_IS_MOUNTED);
+ if (!test_tree_conn_flag(work->tcon,
+ KSMBD_TREE_CONN_FLAG_WRITABLE))
+ info->DeviceCharacteristics |=
+ cpu_to_le32(FILE_READ_ONLY_DEVICE);
rsp->OutputBufferLength = cpu_to_le32(8);
break;
}
diff --git a/fs/stat.c b/fs/stat.c
index 70bd3e888cfa..89ce1be56310 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -90,6 +90,37 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat)
EXPORT_SYMBOL(generic_fill_statx_attr);
/**
+ * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes
+ * @stat: Where to fill in the attribute flags
+ * @unit_min: Minimum supported atomic write length in bytes
+ * @unit_max: Maximum supported atomic write length in bytes
+ *
+ * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
+ * atomic write unit_min and unit_max values.
+ */
+void generic_fill_statx_atomic_writes(struct kstat *stat,
+ unsigned int unit_min,
+ unsigned int unit_max)
+{
+ /* Confirm that the request type is known */
+ stat->result_mask |= STATX_WRITE_ATOMIC;
+
+ /* Confirm that the file attribute type is known */
+ stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC;
+
+ if (unit_min) {
+ stat->atomic_write_unit_min = unit_min;
+ stat->atomic_write_unit_max = unit_max;
+ /* Initially only allow 1x segment */
+ stat->atomic_write_segments_max = 1;
+
+ /* Confirm atomic writes are actually supported */
+ stat->attributes |= STATX_ATTR_WRITE_ATOMIC;
+ }
+}
+EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes);
+
+/**
* vfs_getattr_nosec - getattr without security checks
* @path: file to get attributes from
* @stat: structure to return attributes in
@@ -214,6 +245,43 @@ int getname_statx_lookup_flags(int flags)
return lookup_flags;
}
+static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
+ u32 request_mask)
+{
+ int error = vfs_getattr(path, stat, request_mask, flags);
+
+ if (request_mask & STATX_MNT_ID_UNIQUE) {
+ stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
+ stat->result_mask |= STATX_MNT_ID_UNIQUE;
+ } else {
+ stat->mnt_id = real_mount(path->mnt)->mnt_id;
+ stat->result_mask |= STATX_MNT_ID;
+ }
+
+ if (path_mounted(path))
+ stat->attributes |= STATX_ATTR_MOUNT_ROOT;
+ stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+ /*
+ * If this is a block device inode, override the filesystem
+ * attributes with the block device specific parameters that need to be
+ * obtained from the bdev backing inode.
+ */
+ if (S_ISBLK(stat->mode))
+ bdev_statx(path, stat, request_mask);
+
+ return error;
+}
+
+static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
+ u32 request_mask)
+{
+ CLASS(fd_raw, f)(fd);
+ if (!f.file)
+ return -EBADF;
+ return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -243,36 +311,13 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
retry:
error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- goto out;
-
- error = vfs_getattr(&path, stat, request_mask, flags);
-
- if (request_mask & STATX_MNT_ID_UNIQUE) {
- stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
- stat->result_mask |= STATX_MNT_ID_UNIQUE;
- } else {
- stat->mnt_id = real_mount(path.mnt)->mnt_id;
- stat->result_mask |= STATX_MNT_ID;
- }
-
- if (path.mnt->mnt_root == path.dentry)
- stat->attributes |= STATX_ATTR_MOUNT_ROOT;
- stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
-
- /* Handle STATX_DIOALIGN for block devices. */
- if (request_mask & STATX_DIOALIGN) {
- struct inode *inode = d_backing_inode(path.dentry);
-
- if (S_ISBLK(inode->i_mode))
- bdev_statx_dioalign(inode, stat);
- }
-
+ return error;
+ error = vfs_statx_path(&path, flags, stat, request_mask);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
-out:
return error;
}
@@ -289,18 +334,10 @@ int vfs_fstatat(int dfd, const char __user *filename,
* If AT_EMPTY_PATH is set, we expect the common case to be that
* empty path, and avoid doing all the extra pathname work.
*/
- if (dfd >= 0 && flags == AT_EMPTY_PATH) {
- char c;
-
- ret = get_user(c, filename);
- if (unlikely(ret))
- return ret;
+ if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ return vfs_fstat(dfd, stat);
- if (likely(!c))
- return vfs_fstat(dfd, stat);
- }
-
- name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -488,34 +525,39 @@ static int do_readlinkat(int dfd, const char __user *pathname,
char __user *buf, int bufsiz)
{
struct path path;
+ struct filename *name;
int error;
- int empty = 0;
unsigned int lookup_flags = LOOKUP_EMPTY;
if (bufsiz <= 0)
return -EINVAL;
retry:
- error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
- if (!error) {
- struct inode *inode = d_backing_inode(path.dentry);
-
- error = empty ? -ENOENT : -EINVAL;
- /*
- * AFS mountpoints allow readlink(2) but are not symlinks
- */
- if (d_is_symlink(path.dentry) || inode->i_op->readlink) {
- error = security_inode_readlink(path.dentry);
- if (!error) {
- touch_atime(&path);
- error = vfs_readlink(path.dentry, buf, bufsiz);
- }
- }
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
+ name = getname_flags(pathname, lookup_flags);
+ error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
+ if (unlikely(error)) {
+ putname(name);
+ return error;
+ }
+
+ /*
+ * AFS mountpoints allow readlink(2) but are not symlinks
+ */
+ if (d_is_symlink(path.dentry) ||
+ d_backing_inode(path.dentry)->i_op->readlink) {
+ error = security_inode_readlink(path.dentry);
+ if (!error) {
+ touch_atime(&path);
+ error = vfs_readlink(path.dentry, buf, bufsiz);
}
+ } else {
+ error = (name->name[0] == '\0') ? -ENOENT : -EINVAL;
+ }
+ path_put(&path);
+ putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
}
return error;
}
@@ -659,6 +701,9 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_dio_mem_align = stat->dio_mem_align;
tmp.stx_dio_offset_align = stat->dio_offset_align;
tmp.stx_subvol = stat->subvol;
+ tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
+ tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
+ tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
@@ -674,7 +719,8 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
+ /*
+ * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
* from userland.
*/
mask &= ~STATX_CHANGE_COOKIE;
@@ -686,16 +732,41 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
return cp_statx(&stat, buffer);
}
+int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
+ struct statx __user *buffer)
+{
+ struct kstat stat;
+ int error;
+
+ if (mask & STATX__RESERVED)
+ return -EINVAL;
+ if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
+ return -EINVAL;
+
+ /*
+ * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
+ * from userland.
+ */
+ mask &= ~STATX_CHANGE_COOKIE;
+
+ error = vfs_statx_fd(fd, flags, &stat, mask);
+ if (error)
+ return error;
+
+ return cp_statx(&stat, buffer);
+}
+
/**
* sys_statx - System call to get enhanced stats
* @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat or "" with AT_EMPTY_PATH
+ * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH
* @flags: AT_* flags to control pathwalk.
* @mask: Parts of statx struct actually required.
* @buffer: Result buffer.
*
* Note that fstat() can be emulated by setting dfd to the fd of interest,
- * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
+ * supplying "" (or preferably NULL) as the filename and setting AT_EMPTY_PATH
+ * in the flags.
*/
SYSCALL_DEFINE5(statx,
int, dfd, const char __user *, filename, unsigned, flags,
@@ -703,9 +774,24 @@ SYSCALL_DEFINE5(statx,
struct statx __user *, buffer)
{
int ret;
+ unsigned lflags;
struct filename *name;
- name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ /*
+ * Short-circuit handling of NULL and "" paths.
+ *
+ * For a NULL path we require and accept only the AT_EMPTY_PATH flag
+ * (possibly |'d with AT_STATX flags).
+ *
+ * However, glibc on 32-bit architectures implements fstatat as statx
+ * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
+ * Supporting this results in the uglification below.
+ */
+ lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
+ if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
putname(name);
diff --git a/fs/super.c b/fs/super.c
index b72f1d288e95..095ba793e10c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1502,8 +1502,17 @@ static int fs_bdev_thaw(struct block_device *bdev)
lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+ /*
+ * The block device may have been frozen before it was claimed by a
+ * filesystem. Concurrently another process might try to mount that
+ * frozen block device and has temporarily claimed the block device for
+ * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
+ * mounter is already about to abort mounting because they still saw an
+ * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return
+ * NULL in that case.
+ */
sb = get_bdev_super(bdev);
- if (WARN_ON_ONCE(!sb))
+ if (!sb)
return -EINVAL;
if (sb->s_op->thaw_super)
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 3365a30dc1e0..5c0d07ddbda2 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -591,4 +591,5 @@ static void __exit exit_sysv_fs(void)
module_init(init_sysv_fs)
module_exit(exit_sysv_fs)
+MODULE_DESCRIPTION("SystemV Filesystem");
MODULE_LICENSE("GPL");
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 7c29f4afc23d..1028ab6d9a74 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -296,9 +296,9 @@ enum {
};
static const struct fs_parameter_spec tracefs_param_specs[] = {
- fsparam_u32 ("gid", Opt_gid),
+ fsparam_gid ("gid", Opt_gid),
fsparam_u32oct ("mode", Opt_mode),
- fsparam_u32 ("uid", Opt_uid),
+ fsparam_uid ("uid", Opt_uid),
{}
};
@@ -306,8 +306,6 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param
{
struct tracefs_fs_info *opts = fc->s_fs_info;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, tracefs_param_specs, param, &result);
@@ -316,16 +314,10 @@ static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param
switch (opt) {
case Opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return invalf(fc, "Unknown uid");
- opts->uid = uid;
+ opts->uid = result.uid;
break;
case Opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return invalf(fc, "Unknown gid");
- opts->gid = gid;
+ opts->gid = result.gid;
break;
case Opt_mode:
opts->mode = result.uint_32 & S_IALLUGO;
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index ab3ffc355949..d8fc11765d61 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -18,6 +18,7 @@
#include "udfdecl.h"
#include <linux/bitops.h>
+#include <linux/overflow.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -64,14 +65,18 @@ static int read_block_bitmap(struct super_block *sb,
}
for (i = 0; i < count; i++)
- if (udf_test_bit(i + off, bh->b_data))
+ if (udf_test_bit(i + off, bh->b_data)) {
+ bitmap->s_block_bitmap[bitmap_nr] =
+ ERR_PTR(-EFSCORRUPTED);
+ brelse(bh);
return -EFSCORRUPTED;
+ }
return 0;
}
-static int __load_block_bitmap(struct super_block *sb,
- struct udf_bitmap *bitmap,
- unsigned int block_group)
+static int load_block_bitmap(struct super_block *sb,
+ struct udf_bitmap *bitmap,
+ unsigned int block_group)
{
int retval = 0;
int nr_groups = bitmap->s_nr_groups;
@@ -81,8 +86,15 @@ static int __load_block_bitmap(struct super_block *sb,
block_group, nr_groups);
}
- if (bitmap->s_block_bitmap[block_group])
+ if (bitmap->s_block_bitmap[block_group]) {
+ /*
+ * The bitmap failed verification in the past. No point in
+ * trying again.
+ */
+ if (IS_ERR(bitmap->s_block_bitmap[block_group]))
+ return PTR_ERR(bitmap->s_block_bitmap[block_group]);
return block_group;
+ }
retval = read_block_bitmap(sb, bitmap, block_group, block_group);
if (retval < 0)
@@ -91,23 +103,6 @@ static int __load_block_bitmap(struct super_block *sb,
return block_group;
}
-static inline int load_block_bitmap(struct super_block *sb,
- struct udf_bitmap *bitmap,
- unsigned int block_group)
-{
- int slot;
-
- slot = __load_block_bitmap(sb, bitmap, block_group);
-
- if (slot < 0)
- return slot;
-
- if (!bitmap->s_block_bitmap[slot])
- return -EIO;
-
- return slot;
-}
-
static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
{
struct udf_sb_info *sbi = UDF_SB(sb);
@@ -129,7 +124,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
{
struct udf_sb_info *sbi = UDF_SB(sb);
struct buffer_head *bh = NULL;
- struct udf_part_map *partmap;
unsigned long block;
unsigned long block_group;
unsigned long bit;
@@ -138,19 +132,9 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
unsigned long overflow;
mutex_lock(&sbi->s_alloc_mutex);
- partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
- if (bloc->logicalBlockNum + count < count ||
- (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
- udf_debug("%u < %d || %u + %u > %u\n",
- bloc->logicalBlockNum, 0,
- bloc->logicalBlockNum, count,
- partmap->s_partition_len);
- goto error_return;
- }
-
+ /* We make sure this cannot overflow when mounting the filesystem */
block = bloc->logicalBlockNum + offset +
(sizeof(struct spaceBitmapDesc) << 3);
-
do {
overflow = 0;
block_group = block >> (sb->s_blocksize_bits + 3);
@@ -380,7 +364,6 @@ static void udf_table_free_blocks(struct super_block *sb,
uint32_t count)
{
struct udf_sb_info *sbi = UDF_SB(sb);
- struct udf_part_map *partmap;
uint32_t start, end;
uint32_t elen;
struct kernel_lb_addr eloc;
@@ -389,16 +372,6 @@ static void udf_table_free_blocks(struct super_block *sb,
struct udf_inode_info *iinfo;
mutex_lock(&sbi->s_alloc_mutex);
- partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
- if (bloc->logicalBlockNum + count < count ||
- (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
- udf_debug("%u < %d || %u + %u > %u\n",
- bloc->logicalBlockNum, 0,
- bloc->logicalBlockNum, count,
- partmap->s_partition_len);
- goto error_return;
- }
-
iinfo = UDF_I(table);
udf_add_free_space(sb, sbi->s_partition, count);
@@ -673,6 +646,17 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
{
uint16_t partition = bloc->partitionReferenceNum;
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
+ uint32_t blk;
+
+ if (check_add_overflow(bloc->logicalBlockNum, offset, &blk) ||
+ check_add_overflow(blk, count, &blk) ||
+ bloc->logicalBlockNum + count > map->s_partition_len) {
+ udf_debug("Invalid request to free blocks: (%d, %u), off %u, "
+ "len %u, partition len %u\n",
+ partition, bloc->logicalBlockNum, offset, count,
+ map->s_partition_len);
+ return;
+ }
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
udf_bitmap_free_blocks(sb, map->s_uspace.s_bitmap,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 97c59585208c..3a4179de316b 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -232,7 +232,9 @@ static int udf_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
+ filemap_invalidate_lock(inode->i_mapping);
error = udf_setsize(inode, attr->ia_size);
+ filemap_invalidate_unlock(inode->i_mapping);
if (error)
return error;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2fb21c5ffccf..4726a4d014b6 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1247,10 +1247,7 @@ int udf_setsize(struct inode *inode, loff_t newsize)
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)))
return -EINVAL;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return -EPERM;
- filemap_invalidate_lock(inode->i_mapping);
iinfo = UDF_I(inode);
if (newsize > inode->i_size) {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
@@ -1263,11 +1260,11 @@ int udf_setsize(struct inode *inode, loff_t newsize)
}
err = udf_expand_file_adinicb(inode);
if (err)
- goto out_unlock;
+ return err;
}
err = udf_extend_file(inode, newsize);
if (err)
- goto out_unlock;
+ return err;
set_size:
truncate_setsize(inode, newsize);
} else {
@@ -1285,14 +1282,14 @@ set_size:
err = block_truncate_page(inode->i_mapping, newsize,
udf_get_block);
if (err)
- goto out_unlock;
+ return err;
truncate_setsize(inode, newsize);
down_write(&iinfo->i_data_sem);
udf_clear_extent_cache(inode);
err = udf_truncate_extents(inode);
up_write(&iinfo->i_data_sem);
if (err)
- goto out_unlock;
+ return err;
}
update_time:
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -1300,8 +1297,6 @@ update_time:
udf_sync_inode(inode);
else
mark_inode_dirty(inode);
-out_unlock:
- filemap_invalidate_unlock(inode->i_mapping);
return err;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 1308109fd42d..78a603129dd5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -876,8 +876,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (has_diriter) {
diriter.fi.icb.extLocation =
cpu_to_lelb(UDF_I(new_dir)->i_location);
- udf_update_tag((char *)&diriter.fi,
- udf_dir_entry_len(&diriter.fi));
udf_fiiter_write_fi(&diriter, NULL);
udf_fiiter_release(&diriter);
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9381a66c6ce5..3460ecc826d1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -336,7 +336,8 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
int nr_groups = bitmap->s_nr_groups;
for (i = 0; i < nr_groups; i++)
- brelse(bitmap->s_block_bitmap[i]);
+ if (!IS_ERR_OR_NULL(bitmap->s_block_bitmap[i]))
+ brelse(bitmap->s_block_bitmap[i]);
kvfree(bitmap);
}
@@ -1110,12 +1111,19 @@ static int udf_fill_partdesc_info(struct super_block *sb,
struct udf_part_map *map;
struct udf_sb_info *sbi = UDF_SB(sb);
struct partitionHeaderDesc *phd;
+ u32 sum;
int err;
map = &sbi->s_partmaps[p_index];
map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
+ if (check_add_overflow(map->s_partition_root, map->s_partition_len,
+ &sum)) {
+ udf_err(sb, "Partition %d has invalid location %u + %u\n",
+ p_index, map->s_partition_root, map->s_partition_len);
+ return -EFSCORRUPTED;
+ }
if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
@@ -1171,6 +1179,14 @@ static int udf_fill_partdesc_info(struct super_block *sb,
bitmap->s_extPosition = le32_to_cpu(
phd->unallocSpaceBitmap.extPosition);
map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
+ /* Check whether math over bitmap won't overflow. */
+ if (check_add_overflow(map->s_partition_len,
+ sizeof(struct spaceBitmapDesc) << 3,
+ &sum)) {
+ udf_err(sb, "Partition %d is too long (%u)\n", p_index,
+ map->s_partition_len);
+ return -EFSCORRUPTED;
+ }
udf_debug("unallocSpaceBitmap (part %d) @ %u\n",
p_index, bitmap->s_extPosition);
}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 27c85d92d1dc..61f25d3cf3f7 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -188,7 +188,6 @@ Eend:
"offset=%lu",
dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
fail:
- SetPageError(page);
return false;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index eee7320ab0b0..27a3e9285fbf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,7 +257,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
goto out;
ret = false;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
@@ -2057,7 +2057,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
goto out;
features = uffdio_api.features;
ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ if (uffdio_api.api != UFFD_API)
goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
@@ -2081,6 +2081,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
+
+ ret = -EINVAL;
+ if (features & ~uffdio_api.features)
+ goto err_out;
+
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 118dedef8ebe..fdb4da24d662 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -228,26 +228,19 @@ const struct inode_operations vboxsf_reg_iops = {
static int vboxsf_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct vboxsf_handle *sf_handle = file->private_data;
- loff_t off = page_offset(page);
+ loff_t off = folio_pos(folio);
u32 nread = PAGE_SIZE;
u8 *buf;
int err;
- buf = kmap(page);
+ buf = kmap_local_folio(folio, 0);
err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf);
- if (err == 0) {
- memset(&buf[nread], 0, PAGE_SIZE - nread);
- flush_dcache_page(page);
- SetPageUptodate(page);
- } else {
- SetPageError(page);
- }
+ buf = folio_zero_tail(folio, nread, buf + nread);
- kunmap(page);
- unlock_page(page);
+ kunmap_local(buf);
+ folio_end_read(folio, err == 0);
return err;
}
@@ -295,7 +288,6 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
kref_put(&sf_handle->refcount, vboxsf_handle_release);
if (err == 0) {
- ClearPageError(page);
/* mtime changed */
sf_i->force_restat = 1;
} else {
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index ffb1d565da39..e95b8a48d8a0 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -41,8 +41,8 @@ enum { opt_nls, opt_uid, opt_gid, opt_ttl, opt_dmode, opt_fmode,
static const struct fs_parameter_spec vboxsf_fs_parameters[] = {
fsparam_string ("nls", opt_nls),
- fsparam_u32 ("uid", opt_uid),
- fsparam_u32 ("gid", opt_gid),
+ fsparam_uid ("uid", opt_uid),
+ fsparam_gid ("gid", opt_gid),
fsparam_u32 ("ttl", opt_ttl),
fsparam_u32oct ("dmode", opt_dmode),
fsparam_u32oct ("fmode", opt_fmode),
@@ -55,8 +55,6 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct vboxsf_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- kuid_t uid;
- kgid_t gid;
int opt;
opt = fs_parse(fc, vboxsf_fs_parameters, param, &result);
@@ -73,16 +71,10 @@ static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->string = NULL;
break;
case opt_uid:
- uid = make_kuid(current_user_ns(), result.uint_32);
- if (!uid_valid(uid))
- return -EINVAL;
- ctx->o.uid = uid;
+ ctx->o.uid = result.uid;
break;
case opt_gid:
- gid = make_kgid(current_user_ns(), result.uint_32);
- if (!gid_valid(gid))
- return -EINVAL;
- ctx->o.gid = gid;
+ ctx->o.gid = result.gid;
break;
case opt_ttl:
ctx->o.ttl = msecs_to_jiffies(result.uint_32);
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 3969d54158d1..175d2f1bc089 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -111,14 +111,15 @@ __bpf_kfunc_start_defs();
/**
* bpf_get_fsverity_digest: read fsverity digest of file
* @file: file to get digest from
- * @digest_ptr: (out) dynptr for struct fsverity_digest
+ * @digest_p: (out) dynptr for struct fsverity_digest
*
* Read fsverity_digest of *file* into *digest_ptr*.
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_kern *digest_ptr)
+__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_p)
{
+ struct bpf_dynptr_kern *digest_ptr = (struct bpf_dynptr_kern *)digest_p;
const struct inode *inode = file_inode(file);
u32 dynptr_sz = __bpf_dynptr_size(digest_ptr);
struct fsverity_digest *arg;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index d41edd30388b..fffd6fffdce0 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -217,6 +217,18 @@ config XFS_DEBUG
Say N unless you are an XFS developer, or you play one on TV.
+config XFS_DEBUG_EXPENSIVE
+ bool "XFS expensive debugging checks"
+ depends on XFS_FS && XFS_DEBUG
+ help
+ Say Y here to get an XFS build with expensive debugging checks
+ enabled. These checks may affect performance significantly.
+
+ Note that the resulting code will be HUGER and SLOWER, and probably
+ not useful unless you are debugging a particular problem.
+
+ Say N unless you are an XFS developer, or you play one on TV.
+
config XFS_ASSERT_FATAL
bool "XFS fatal asserts"
default y
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c50447548d65..dd692619bed5 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,6 +40,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_iext_tree.o \
xfs_inode_fork.o \
xfs_inode_buf.o \
+ xfs_inode_util.o \
xfs_log_rlimit.o \
xfs_ag_resv.o \
xfs_parent.o \
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 240e079cb3fb..7e80732cb547 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -1008,7 +1008,7 @@ xfs_ag_shrink_space(
goto resv_err;
err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
- XFS_AG_RESV_NONE, true);
+ XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD);
if (err2)
goto resv_err;
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index ff20ed93de77..f247eeff7358 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -33,23 +33,4 @@ xfs_perag_resv(
}
}
-/*
- * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
- * the AGFL, they are allocated one at a time and the reservation updates don't
- * require a transaction.
- */
-static inline void
-xfs_ag_resv_rmapbt_alloc(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_alloc_arg args = { NULL };
- struct xfs_perag *pag;
-
- args.len = 1;
- pag = xfs_perag_get(mp, agno);
- xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
- xfs_perag_put(pag);
-}
-
#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 6c55a6e88eba..59326f84f6a5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -27,6 +27,7 @@
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
#include "xfs_health.h"
+#include "xfs_extfree_item.h"
struct kmem_cache *xfs_extfree_item_cache;
@@ -466,6 +467,97 @@ xfs_alloc_fix_len(
}
/*
+ * Determine if the cursor points to the block that contains the right-most
+ * block of records in the by-count btree. This block contains the largest
+ * contiguous free extent in the AG, so if we modify a record in this block we
+ * need to call xfs_alloc_fixup_longest() once the modifications are done to
+ * ensure the agf->agf_longest field is kept up to date with the longest free
+ * extent tracked by the by-count btree.
+ */
+static bool
+xfs_alloc_cursor_at_lastrec(
+ struct xfs_btree_cur *cnt_cur)
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cnt_cur, 0, &bp);
+
+ xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB);
+ return xfs_btree_ptr_is_null(cnt_cur, &ptr);
+}
+
+/*
+ * Find the rightmost record of the cntbt, and return the longest free space
+ * recorded in it. Simply set both the block number and the length to their
+ * maximum values before searching.
+ */
+static int
+xfs_cntbt_longest(
+ struct xfs_btree_cur *cnt_cur,
+ xfs_extlen_t *longest)
+{
+ struct xfs_alloc_rec_incore irec;
+ union xfs_btree_rec *rec;
+ int stat = 0;
+ int error;
+
+ memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec));
+ error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ return error;
+ if (!stat) {
+ /* totally empty tree */
+ *longest = 0;
+ return 0;
+ }
+
+ error = xfs_btree_get_rec(cnt_cur, &rec, &stat);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) {
+ xfs_btree_mark_sick(cnt_cur);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_alloc_btrec_to_irec(rec, &irec);
+ *longest = irec.ar_blockcount;
+ return 0;
+}
+
+/*
+ * Update the longest contiguous free extent in the AG from the by-count cursor
+ * that is passed to us. This should be done at the end of any allocation or
+ * freeing operation that touches the longest extent in the btree.
+ *
+ * Needing to update the longest extent can be determined by calling
+ * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record
+ * modification but before the modification begins.
+ */
+static int
+xfs_alloc_fixup_longest(
+ struct xfs_btree_cur *cnt_cur)
+{
+ struct xfs_perag *pag = cnt_cur->bc_ag.pag;
+ struct xfs_buf *bp = cnt_cur->bc_ag.agbp;
+ struct xfs_agf *agf = bp->b_addr;
+ xfs_extlen_t longest = 0;
+ int error;
+
+ /* Lookup last rec in order to update AGF. */
+ error = xfs_cntbt_longest(cnt_cur, &longest);
+ if (error)
+ return error;
+
+ pag->pagf_longest = longest;
+ agf->agf_longest = cpu_to_be32(pag->pagf_longest);
+ xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST);
+
+ return 0;
+}
+
+/*
* Update the two btrees, logically removing from freespace the extent
* starting at rbno, rlen blocks. The extent is contained within the
* actual (current) free extent fbno for flen blocks.
@@ -489,6 +581,7 @@ xfs_alloc_fixup_trees(
xfs_extlen_t nflen1=0; /* first new free length */
xfs_extlen_t nflen2=0; /* second new free length */
struct xfs_mount *mp;
+ bool fixup_longest = false;
mp = cnt_cur->bc_mp;
@@ -577,6 +670,10 @@ xfs_alloc_fixup_trees(
nfbno2 = rbno + rlen;
nflen2 = (fbno + flen) - nfbno2;
}
+
+ if (xfs_alloc_cursor_at_lastrec(cnt_cur))
+ fixup_longest = true;
+
/*
* Delete the entry from the by-size btree.
*/
@@ -654,6 +751,10 @@ xfs_alloc_fixup_trees(
return -EFSCORRUPTED;
}
}
+
+ if (fixup_longest)
+ return xfs_alloc_fixup_longest(cnt_cur);
+
return 0;
}
@@ -1932,7 +2033,7 @@ out_nominleft:
/*
* Free the extent starting at agno/bno for length.
*/
-STATIC int
+int
xfs_free_ag_extent(
struct xfs_trans *tp,
struct xfs_buf *agbp,
@@ -1956,6 +2057,7 @@ xfs_free_ag_extent(
int i;
int error;
struct xfs_perag *pag = agbp->b_pag;
+ bool fixup_longest = false;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
@@ -2219,8 +2321,13 @@ xfs_free_ag_extent(
}
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
bno_cur = NULL;
+
/*
* In all cases we need to insert the new freespace in the by-size tree.
+ *
+ * If this new freespace is being inserted in the block that contains
+ * the largest free space in the btree, make sure we also fix up the
+ * agf->agf-longest tracker field.
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
@@ -2229,6 +2336,8 @@ xfs_free_ag_extent(
error = -EFSCORRUPTED;
goto error0;
}
+ if (xfs_alloc_cursor_at_lastrec(cnt_cur))
+ fixup_longest = true;
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2236,6 +2345,12 @@ xfs_free_ag_extent(
error = -EFSCORRUPTED;
goto error0;
}
+ if (fixup_longest) {
+ error = xfs_alloc_fixup_longest(cnt_cur);
+ if (error)
+ goto error0;
+ }
+
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
@@ -2422,32 +2537,6 @@ xfs_alloc_space_available(
return true;
}
-int
-xfs_free_agfl_block(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t agbno,
- struct xfs_buf *agbp,
- struct xfs_owner_info *oinfo)
-{
- int error;
- struct xfs_buf *bp;
-
- error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
- XFS_AG_RESV_AGFL);
- if (error)
- return error;
-
- error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
- XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
- tp->t_mountp->m_bsize, 0, &bp);
- if (error)
- return error;
- xfs_trans_binval(tp, bp);
-
- return 0;
-}
-
/*
* Check the agfl fields of the agf for inconsistency or corruption.
*
@@ -2536,48 +2625,6 @@ xfs_agfl_reset(
}
/*
- * Defer an AGFL block free. This is effectively equivalent to
- * xfs_free_extent_later() with some special handling particular to AGFL blocks.
- *
- * Deferring AGFL frees helps prevent log reservation overruns due to too many
- * allocation operations in a transaction. AGFL frees are prone to this problem
- * because for one they are always freed one at a time. Further, an immediate
- * AGFL block free can cause a btree join and require another block free before
- * the real allocation can proceed. Deferring the free disconnects freeing up
- * the AGFL slot from freeing the block.
- */
-static int
-xfs_defer_agfl_block(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t agbno,
- struct xfs_owner_info *oinfo)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent_free_item *xefi;
- xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
-
- ASSERT(xfs_extfree_item_cache != NULL);
- ASSERT(oinfo != NULL);
-
- if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
- return -EFSCORRUPTED;
-
- xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- xefi->xefi_startblock = fsbno;
- xefi->xefi_blockcount = 1;
- xefi->xefi_owner = oinfo->oi_owner;
- xefi->xefi_agresv = XFS_AG_RESV_AGFL;
-
- trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
-
- xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
- return 0;
-}
-
-/*
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
@@ -2588,28 +2635,15 @@ xfs_defer_extent_free(
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
- bool skip_discard,
+ unsigned int free_flags,
struct xfs_defer_pending **dfpp)
{
struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
-#ifdef DEBUG
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(len > 0);
ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
- agno = XFS_FSB_TO_AGNO(mp, bno);
- agbno = XFS_FSB_TO_AGBNO(mp, bno);
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(agbno < mp->m_sb.sb_agblocks);
- ASSERT(len < mp->m_sb.sb_agblocks);
- ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
- ASSERT(xfs_extfree_item_cache != NULL);
- ASSERT(type != XFS_AG_RESV_AGFL);
+ ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
return -EFSCORRUPTED;
@@ -2619,7 +2653,7 @@ xfs_defer_extent_free(
xefi->xefi_startblock = bno;
xefi->xefi_blockcount = (xfs_extlen_t)len;
xefi->xefi_agresv = type;
- if (skip_discard)
+ if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
@@ -2632,12 +2666,8 @@ xfs_defer_extent_free(
} else {
xefi->xefi_owner = XFS_RMAP_OWN_NULL;
}
- trace_xfs_bmap_free_defer(mp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_extent_free_get_group(mp, xefi);
- *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
+ xfs_extent_free_defer_add(tp, xefi, dfpp);
return 0;
}
@@ -2648,11 +2678,11 @@ xfs_free_extent_later(
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
- bool skip_discard)
+ unsigned int free_flags)
{
struct xfs_defer_pending *dontcare = NULL;
- return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+ return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags,
&dontcare);
}
@@ -2677,13 +2707,13 @@ xfs_free_extent_later(
int
xfs_alloc_schedule_autoreap(
const struct xfs_alloc_arg *args,
- bool skip_discard,
+ unsigned int free_flags,
struct xfs_alloc_autoreap *aarp)
{
int error;
error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
- &args->oinfo, args->resv, skip_discard, &aarp->dfp);
+ &args->oinfo, args->resv, free_flags, &aarp->dfp);
if (error)
return error;
@@ -2895,8 +2925,21 @@ xfs_alloc_fix_freelist(
if (error)
goto out_agbp_relse;
- /* defer agfl frees */
- error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+ /*
+ * Defer the AGFL block free.
+ *
+ * This helps to prevent log reservation overruns due to too
+ * many allocation operations in a transaction. AGFL frees are
+ * prone to this problem because for one they are always freed
+ * one at a time. Further, an immediate AGFL block free can
+ * cause a btree join and require another block free before the
+ * real allocation can proceed.
+ * Deferring the free disconnects freeing up the AGFL slot from
+ * freeing the block.
+ */
+ error = xfs_free_extent_later(tp,
+ XFS_AGB_TO_FSB(mp, args->agno, bno), 1,
+ &targs.oinfo, XFS_AG_RESV_AGFL, 0);
if (error)
goto out_agbp_relse;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0b956f8b9d5a..fae170825be0 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -80,6 +80,10 @@ int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
struct xfs_buf *agfbp, struct xfs_buf *agflbp,
xfs_agblock_t bno, int btreeblk);
+int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp,
+ xfs_agnumber_t agno, xfs_agblock_t bno,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type);
/*
* Compute and fill in value of m_alloc_maxlevels.
@@ -194,8 +198,6 @@ int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
struct xfs_buf **agfbpp);
int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp,
struct xfs_buf **bpp);
-int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
- struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_buf **agbp);
@@ -233,7 +235,12 @@ xfs_buf_to_agfl_bno(
int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type, bool skip_discard);
+ enum xfs_ag_resv_type type, unsigned int free_flags);
+
+/* Don't issue a discard for the blocks freed. */
+#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD)
/*
* List of extents to be free "later".
@@ -249,9 +256,6 @@ struct xfs_extent_free_item {
enum xfs_ag_resv_type xefi_agresv;
};
-void xfs_extent_free_get_group(struct xfs_mount *mp,
- struct xfs_extent_free_item *xefi);
-
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
@@ -262,7 +266,7 @@ struct xfs_alloc_autoreap {
};
int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
- bool skip_discard, struct xfs_alloc_autoreap *aarp);
+ unsigned int free_flags, struct xfs_alloc_autoreap *aarp);
void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
struct xfs_alloc_autoreap *aarp);
void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 6ef5ddd89600..585e98e87ef9 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -115,67 +115,6 @@ xfs_allocbt_free_block(
return 0;
}
-/*
- * Update the longest extent in the AGF
- */
-STATIC void
-xfs_allocbt_update_lastrec(
- struct xfs_btree_cur *cur,
- const struct xfs_btree_block *block,
- const union xfs_btree_rec *rec,
- int ptr,
- int reason)
-{
- struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- struct xfs_perag *pag;
- __be32 len;
- int numrecs;
-
- ASSERT(!xfs_btree_is_bno(cur->bc_ops));
-
- switch (reason) {
- case LASTREC_UPDATE:
- /*
- * If this is the last leaf block and it's the last record,
- * then update the size of the longest extent in the AG.
- */
- if (ptr != xfs_btree_get_numrecs(block))
- return;
- len = rec->alloc.ar_blockcount;
- break;
- case LASTREC_INSREC:
- if (be32_to_cpu(rec->alloc.ar_blockcount) <=
- be32_to_cpu(agf->agf_longest))
- return;
- len = rec->alloc.ar_blockcount;
- break;
- case LASTREC_DELREC:
- numrecs = xfs_btree_get_numrecs(block);
- if (ptr <= numrecs)
- return;
- ASSERT(ptr == numrecs + 1);
-
- if (numrecs) {
- xfs_alloc_rec_t *rrp;
-
- rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
- len = rrp->ar_blockcount;
- } else {
- len = 0;
- }
-
- break;
- default:
- ASSERT(0);
- return;
- }
-
- agf->agf_longest = len;
- pag = cur->bc_ag.agbp->b_pag;
- pag->pagf_longest = be32_to_cpu(len);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
-}
-
STATIC int
xfs_allocbt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -493,7 +432,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = {
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
- .update_lastrec = xfs_allocbt_update_lastrec,
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
@@ -511,7 +449,6 @@ const struct xfs_btree_ops xfs_bnobt_ops = {
const struct xfs_btree_ops xfs_cntbt_ops = {
.name = "cnt",
.type = XFS_BTREE_TYPE_AG,
- .geom_flags = XFS_BTGEO_LASTREC_UPDATE,
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
@@ -525,7 +462,6 @@ const struct xfs_btree_ops xfs_cntbt_ops = {
.set_root = xfs_allocbt_set_root,
.alloc_block = xfs_allocbt_alloc_block,
.free_block = xfs_allocbt_free_block,
- .update_lastrec = xfs_allocbt_update_lastrec,
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c101cf266bc4..7df74c35d9f9 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -39,6 +39,7 @@
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
+#include "xfs_inode_util.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -604,7 +605,7 @@ xfs_bmap_btree_to_extents(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
return error;
@@ -4058,20 +4059,32 @@ xfs_bmapi_reserve_delalloc(
xfs_extlen_t indlen;
uint64_t fdblocks;
int error;
- xfs_fileoff_t aoff = off;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+retry:
/*
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
+ aoff = off;
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
prealloc = alen - len;
- /* Figure out the extent size, adjust alen */
- if (whichfork == XFS_COW_FORK) {
+ /*
+ * If we're targetting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
struct xfs_bmbt_irec prev;
xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
@@ -4090,7 +4103,7 @@ xfs_bmapi_reserve_delalloc(
*/
error = xfs_quota_reserve_blkres(ip, alen);
if (error)
- return error;
+ goto out;
/*
* Split changing sb for alen and indlen since they could be coming
@@ -4140,6 +4153,17 @@ out_unreserve_frextents:
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
return error;
}
@@ -5357,11 +5381,15 @@ xfs_bmap_del_extent_real(
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
} else {
+ unsigned int efi_flags = 0;
+
+ if ((bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN)
+ efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+
error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
- XFS_AG_RESV_NONE,
- ((bflags & XFS_BMAPI_NODISCARD) ||
- del->br_state == XFS_EXT_UNWRITTEN));
+ XFS_AG_RESV_NONE, efi_flags);
}
if (error)
return error;
@@ -6431,3 +6459,45 @@ xfs_bmap_query_all(
return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
}
+
+/* Helper function to extract extent size hint from inode */
+xfs_extlen_t
+xfs_get_extsz_hint(
+ struct xfs_inode *ip)
+{
+ /*
+ * No point in aligning allocations if we need to COW to actually
+ * write to them.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ return 0;
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
+ return ip->i_extsize;
+ if (XFS_IS_REALTIME_INODE(ip) &&
+ ip->i_mount->m_sb.sb_rextsize > 1)
+ return ip->i_mount->m_sb.sb_rextsize;
+ return 0;
+}
+
+/*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two. If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+ struct xfs_inode *ip)
+{
+ xfs_extlen_t a, b;
+
+ a = 0;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ a = ip->i_cowextsize;
+ b = xfs_get_extsz_hint(ip);
+
+ a = max(a, b);
+ if (a == 0)
+ return XFS_DEFAULT_COWEXTSZ_HINT;
+ return a;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 667b0c2b33d1..7592d46e97c6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -296,4 +296,7 @@ typedef int (*xfs_bmap_query_range_fn)(
int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
void *priv);
+xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index f5d84dcb58da..d1b06ccde19e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -282,7 +282,7 @@ xfs_bmbt_free_block(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index d29547572a68..a5c4af148853 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1331,30 +1331,6 @@ xfs_btree_init_block_cur(
xfs_btree_owner(cur));
}
-/*
- * Return true if ptr is the last record in the btree and
- * we need to track updates to this record. The decision
- * will be further refined in the update_lastrec method.
- */
-STATIC int
-xfs_btree_is_lastrec(
- struct xfs_btree_cur *cur,
- struct xfs_btree_block *block,
- int level)
-{
- union xfs_btree_ptr ptr;
-
- if (level > 0)
- return 0;
- if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
- return 0;
-
- xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
- if (!xfs_btree_ptr_is_null(cur, &ptr))
- return 0;
- return 1;
-}
-
STATIC void
xfs_btree_buf_to_ptr(
struct xfs_btree_cur *cur,
@@ -2420,15 +2396,6 @@ xfs_btree_update(
xfs_btree_copy_recs(cur, rp, rec, 1);
xfs_btree_log_recs(cur, bp, ptr, ptr);
- /*
- * If we are tracking the last record in the tree and
- * we are at the far right edge of the tree, update it.
- */
- if (xfs_btree_is_lastrec(cur, block, 0)) {
- cur->bc_ops->update_lastrec(cur, block, rec,
- ptr, LASTREC_UPDATE);
- }
-
/* Pass new key value up to our parent. */
if (xfs_btree_needs_key_update(cur, ptr)) {
error = xfs_btree_update_keys(cur, 0);
@@ -3618,15 +3585,6 @@ xfs_btree_insrec(
}
/*
- * If we are tracking the last record in the tree and
- * we are at the far right edge of the tree, update it.
- */
- if (xfs_btree_is_lastrec(cur, block, level)) {
- cur->bc_ops->update_lastrec(cur, block, rec,
- ptr, LASTREC_INSREC);
- }
-
- /*
* Return the new block number, if any.
* If there is one, give back a record value and a cursor too.
*/
@@ -3984,15 +3942,6 @@ xfs_btree_delrec(
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
/*
- * If we are tracking the last record in the tree and
- * we are at the far right edge of the tree, update it.
- */
- if (xfs_btree_is_lastrec(cur, block, level)) {
- cur->bc_ops->update_lastrec(cur, block, NULL,
- ptr, LASTREC_DELREC);
- }
-
- /*
* We're at the root level. First, shrink the root block in-memory.
* Try to get rid of the next level down. If we can't then there's
* nothing left to do.
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index f93374278aa1..10b7ddc3b2b3 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -154,12 +154,6 @@ struct xfs_btree_ops {
int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
- /* update last record information */
- void (*update_lastrec)(struct xfs_btree_cur *cur,
- const struct xfs_btree_block *block,
- const union xfs_btree_rec *rec,
- int ptr, int reason);
-
/* records in block/level */
int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
@@ -222,15 +216,7 @@ struct xfs_btree_ops {
};
/* btree geometry flags */
-#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */
-#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */
-
-/*
- * Reasons for the update_lastrec method to be called.
- */
-#define LASTREC_UPDATE 0
-#define LASTREC_INSREC 1
-#define LASTREC_DELREC 2
+#define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */
union xfs_btree_irec {
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 4a078e07e1a0..40021849b42f 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -12,12 +12,14 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
#include "xfs_bmap.h"
@@ -556,7 +558,7 @@ xfs_defer_relog(
* the log threshold once per call.
*/
if (threshold_lsn == NULLCOMMITLSN) {
- threshold_lsn = xlog_grant_push_threshold(log, 0);
+ threshold_lsn = xfs_ail_get_push_target(log->l_ailp);
if (threshold_lsn == NULLCOMMITLSN)
break;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 457f9a38f850..202468223bf9 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,6 +19,11 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_health.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_parent.h"
+#include "xfs_ag.h"
+#include "xfs_ialloc.h"
const struct xfs_name xfs_name_dotdot = {
.name = (const unsigned char *)"..",
@@ -584,9 +589,9 @@ xfs_dir_replace(
*/
int
xfs_dir_canenter(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- struct xfs_name *name) /* name of entry to add */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name) /* name of entry to add */
{
return xfs_dir_createname(tp, dp, name, 0, 0);
}
@@ -756,3 +761,653 @@ xfs_dir2_compname(
return xfs_ascii_ci_compname(args, name, len);
return xfs_da_compname(args, name, len);
}
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users. Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+ xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+ xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+ struct xfs_inode *dp,
+ struct xfs_inode *ip,
+ int delta,
+ const struct xfs_name *name)
+{
+ if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+ struct xfs_dir_update_params p = {
+ .dp = dp,
+ .ip = ip,
+ .delta = delta,
+ .name = name,
+ };
+ struct xfs_mount *mp = ip->i_mount;
+
+ xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+ }
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+ struct xfs_mount *mp,
+ struct xfs_dir_hook *hook)
+{
+ xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Configure directory update hook functions. */
+void
+xfs_dir_hook_setup(
+ struct xfs_dir_hook *hook,
+ notifier_fn_t mod_fn)
+{
+ xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
+/*
+ * Given a directory @dp, a newly allocated inode @ip, and a @name, link @ip
+ * into @dp under the given @name. If @ip is a directory, it will be
+ * initialized. Both inodes must have the ILOCK held and the transaction must
+ * have sufficient blocks reserved.
+ */
+int
+xfs_dir_create_child(
+ struct xfs_trans *tp,
+ unsigned int resblks,
+ struct xfs_dir_update *du)
+{
+ struct xfs_inode *dp = du->dp;
+ const struct xfs_name *name = du->name;
+ struct xfs_inode *ip = du->ip;
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(dp, XFS_ILOCK_EXCL);
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+ if (error) {
+ ASSERT(error != -ENOSPC);
+ return error;
+ }
+
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ error = xfs_dir_init(tp, ip, dp);
+ if (error)
+ return error;
+
+ xfs_bumplink(tp, dp);
+ }
+
+ /*
+ * If we have parent pointers, we need to add the attribute containing
+ * the parent information now.
+ */
+ if (du->ppargs) {
+ error = xfs_parent_addname(tp, du->ppargs, dp, name, ip);
+ if (error)
+ return error;
+ }
+
+ xfs_dir_update_hook(dp, ip, 1, name);
+ return 0;
+}
+
+/*
+ * Given a directory @dp, an existing non-directory inode @ip, and a @name,
+ * link @ip into @dp under the given @name. Both inodes must have the ILOCK
+ * held.
+ */
+int
+xfs_dir_add_child(
+ struct xfs_trans *tp,
+ unsigned int resblks,
+ struct xfs_dir_update *du)
+{
+ struct xfs_inode *dp = du->dp;
+ const struct xfs_name *name = du->name;
+ struct xfs_inode *ip = du->ip;
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(dp, XFS_ILOCK_EXCL);
+ ASSERT(!S_ISDIR(VFS_I(ip)->i_mode));
+
+ if (!resblks) {
+ error = xfs_dir_canenter(tp, dp, name);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Handle initial link state of O_TMPFILE inode
+ */
+ if (VFS_I(ip)->i_nlink == 0) {
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ error = xfs_dir_createname(tp, dp, name, ip->i_ino, resblks);
+ if (error)
+ return error;
+
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+ xfs_bumplink(tp, ip);
+
+ /*
+ * If we have parent pointers, we now need to add the parent record to
+ * the attribute fork of the inode. If this is the initial parent
+ * attribute, we need to create it correctly, otherwise we can just add
+ * the parent to the inode.
+ */
+ if (du->ppargs) {
+ error = xfs_parent_addname(tp, du->ppargs, dp, name, ip);
+ if (error)
+ return error;
+ }
+
+ xfs_dir_update_hook(dp, ip, 1, name);
+ return 0;
+}
+
+/*
+ * Given a directory @dp, a child @ip, and a @name, remove the (@name, @ip)
+ * entry from the directory. Both inodes must have the ILOCK held.
+ */
+int
+xfs_dir_remove_child(
+ struct xfs_trans *tp,
+ unsigned int resblks,
+ struct xfs_dir_update *du)
+{
+ struct xfs_inode *dp = du->dp;
+ const struct xfs_name *name = du->name;
+ struct xfs_inode *ip = du->ip;
+ int error;
+
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(dp, XFS_ILOCK_EXCL);
+
+ /*
+ * If we're removing a directory perform some additional validation.
+ */
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ ASSERT(VFS_I(ip)->i_nlink >= 2);
+ if (VFS_I(ip)->i_nlink != 2)
+ return -ENOTEMPTY;
+ if (!xfs_dir_isempty(ip))
+ return -ENOTEMPTY;
+
+ /* Drop the link from ip's "..". */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ return error;
+
+ /* Drop the "." link from ip to self. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ return error;
+
+ /*
+ * Point the unlinked child directory's ".." entry to the root
+ * directory to eliminate back-references to inodes that may
+ * get freed before the child directory is closed. If the fs
+ * gets shrunk, this can lead to dirent inode validation errors.
+ */
+ if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+ return error;
+ }
+ } else {
+ /*
+ * When removing a non-directory we need to log the parent
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
+ */
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+ }
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /* Drop the link from dp to ip. */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ return error;
+
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
+ if (error) {
+ ASSERT(error != -ENOENT);
+ return error;
+ }
+
+ /* Remove parent pointer. */
+ if (du->ppargs) {
+ error = xfs_parent_removename(tp, du->ppargs, dp, name, ip);
+ if (error)
+ return error;
+ }
+
+ xfs_dir_update_hook(dp, ip, -1, name);
+ return 0;
+}
+
+/*
+ * Exchange the entry (@name1, @ip1) in directory @dp1 with the entry (@name2,
+ * @ip2) in directory @dp2, and update '..' @ip1 and @ip2's entries as needed.
+ * @ip1 and @ip2 need not be of the same type.
+ *
+ * All inodes must have the ILOCK held, and both entries must already exist.
+ */
+int
+xfs_dir_exchange_children(
+ struct xfs_trans *tp,
+ struct xfs_dir_update *du1,
+ struct xfs_dir_update *du2,
+ unsigned int spaceres)
+{
+ struct xfs_inode *dp1 = du1->dp;
+ const struct xfs_name *name1 = du1->name;
+ struct xfs_inode *ip1 = du1->ip;
+ struct xfs_inode *dp2 = du2->dp;
+ const struct xfs_name *name2 = du2->name;
+ struct xfs_inode *ip2 = du2->ip;
+ int ip1_flags = 0;
+ int ip2_flags = 0;
+ int dp2_flags = 0;
+ int error;
+
+ /* Swap inode number for dirent in first parent */
+ error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
+ if (error)
+ return error;
+
+ /* Swap inode number for dirent in second parent */
+ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
+ if (error)
+ return error;
+
+ /*
+ * If we're renaming one or more directories across different parents,
+ * update the respective ".." entries (and link counts) to match the new
+ * parents.
+ */
+ if (dp1 != dp2) {
+ dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+ if (S_ISDIR(VFS_I(ip2)->i_mode)) {
+ error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+ dp1->i_ino, spaceres);
+ if (error)
+ return error;
+
+ /* transfer ip2 ".." reference to dp1 */
+ if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
+ error = xfs_droplink(tp, dp2);
+ if (error)
+ return error;
+ xfs_bumplink(tp, dp1);
+ }
+
+ /*
+ * Although ip1 isn't changed here, userspace needs
+ * to be warned about the change, so that applications
+ * relying on it (like backup ones), will properly
+ * notify the change
+ */
+ ip1_flags |= XFS_ICHGTIME_CHG;
+ ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ }
+
+ if (S_ISDIR(VFS_I(ip1)->i_mode)) {
+ error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+ dp2->i_ino, spaceres);
+ if (error)
+ return error;
+
+ /* transfer ip1 ".." reference to dp2 */
+ if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
+ error = xfs_droplink(tp, dp1);
+ if (error)
+ return error;
+ xfs_bumplink(tp, dp2);
+ }
+
+ /*
+ * Although ip2 isn't changed here, userspace needs
+ * to be warned about the change, so that applications
+ * relying on it (like backup ones), will properly
+ * notify the change
+ */
+ ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+ ip2_flags |= XFS_ICHGTIME_CHG;
+ }
+ }
+
+ if (ip1_flags) {
+ xfs_trans_ichgtime(tp, ip1, ip1_flags);
+ xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+ }
+ if (ip2_flags) {
+ xfs_trans_ichgtime(tp, ip2, ip2_flags);
+ xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+ }
+ if (dp2_flags) {
+ xfs_trans_ichgtime(tp, dp2, dp2_flags);
+ xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+ }
+ xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+ /* Schedule parent pointer replacements */
+ if (du1->ppargs) {
+ error = xfs_parent_replacename(tp, du1->ppargs, dp1, name1,
+ dp2, name2, ip1);
+ if (error)
+ return error;
+ }
+
+ if (du2->ppargs) {
+ error = xfs_parent_replacename(tp, du2->ppargs, dp2, name2,
+ dp1, name1, ip2);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Inform our hook clients that we've finished an exchange operation as
+ * follows: removed the source and target files from their directories;
+ * added the target to the source directory; and added the source to
+ * the target directory. All inodes are locked, so it's ok to model a
+ * rename this way so long as we say we deleted entries before we add
+ * new ones.
+ */
+ xfs_dir_update_hook(dp1, ip1, -1, name1);
+ xfs_dir_update_hook(dp2, ip2, -1, name2);
+ xfs_dir_update_hook(dp1, ip2, 1, name1);
+ xfs_dir_update_hook(dp2, ip1, 1, name2);
+ return 0;
+}
+
+/*
+ * Given an entry (@src_name, @src_ip) in directory @src_dp, make the entry
+ * @target_name in directory @target_dp point to @src_ip and remove the
+ * original entry, cleaning up everything left behind.
+ *
+ * Cleanup involves dropping a link count on @target_ip, and either removing
+ * the (@src_name, @src_ip) entry from @src_dp or simply replacing the entry
+ * with (@src_name, @wip) if a whiteout inode @wip is supplied.
+ *
+ * All inodes must have the ILOCK held. We assume that if @src_ip is a
+ * directory then its '..' doesn't already point to @target_dp, and that @wip
+ * is a freshly allocated whiteout.
+ */
+int
+xfs_dir_rename_children(
+ struct xfs_trans *tp,
+ struct xfs_dir_update *du_src,
+ struct xfs_dir_update *du_tgt,
+ unsigned int spaceres,
+ struct xfs_dir_update *du_wip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *src_dp = du_src->dp;
+ const struct xfs_name *src_name = du_src->name;
+ struct xfs_inode *src_ip = du_src->ip;
+ struct xfs_inode *target_dp = du_tgt->dp;
+ const struct xfs_name *target_name = du_tgt->name;
+ struct xfs_inode *target_ip = du_tgt->ip;
+ bool new_parent = (src_dp != target_dp);
+ bool src_is_directory;
+ int error;
+
+ src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
+
+ /*
+ * Check for expected errors before we dirty the transaction
+ * so we can return an error without a transaction abort.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If there's no space reservation, check the entry will
+ * fit before actually inserting it.
+ */
+ if (!spaceres) {
+ error = xfs_dir_canenter(tp, target_dp, target_name);
+ if (error)
+ return error;
+ }
+ } else {
+ /*
+ * If target exists and it's a directory, check that whether
+ * it can be destroyed.
+ */
+ if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+ (!xfs_dir_isempty(target_ip) ||
+ (VFS_I(target_ip)->i_nlink > 2)))
+ return -EEXIST;
+ }
+
+ /*
+ * Directory entry creation below may acquire the AGF. Remove
+ * the whiteout from the unlinked list first to preserve correct
+ * AGI/AGF locking order. This dirties the transaction so failures
+ * after this point will abort and log recovery will clean up the
+ * mess.
+ *
+ * For whiteouts, we need to bump the link count on the whiteout
+ * inode. After this point, we have a real link, clear the tmpfile
+ * state flag from the inode so it doesn't accidentally get misused
+ * in future.
+ */
+ if (du_wip->ip) {
+ struct xfs_perag *pag;
+
+ ASSERT(VFS_I(du_wip->ip)->i_nlink == 0);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, du_wip->ip->i_ino));
+ error = xfs_iunlink_remove(tp, pag, du_wip->ip);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ xfs_bumplink(tp, du_wip->ip);
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
+ /*
+ * If target does not exist and the rename crosses
+ * directories, adjust the target directory link count
+ * to account for the ".." reference from the new entry.
+ */
+ error = xfs_dir_createname(tp, target_dp, target_name,
+ src_ip->i_ino, spaceres);
+ if (error)
+ return error;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ if (new_parent && src_is_directory) {
+ xfs_bumplink(tp, target_dp);
+ }
+ } else { /* target_ip != NULL */
+ /*
+ * Link the source inode under the target name.
+ * If the source inode is a directory and we are moving
+ * it across directories, its ".." entry will be
+ * inconsistent until we replace that down below.
+ *
+ * In case there is already an entry with the same
+ * name at the destination directory, remove it first.
+ */
+ error = xfs_dir_replace(tp, target_dp, target_name,
+ src_ip->i_ino, spaceres);
+ if (error)
+ return error;
+
+ xfs_trans_ichgtime(tp, target_dp,
+ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ /*
+ * Decrement the link count on the target since the target
+ * dir no longer points to it.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ return error;
+
+ if (src_is_directory) {
+ /*
+ * Drop the link from the old "." entry.
+ */
+ error = xfs_droplink(tp, target_ip);
+ if (error)
+ return error;
+ }
+ } /* target_ip != NULL */
+
+ /*
+ * Remove the source.
+ */
+ if (new_parent && src_is_directory) {
+ /*
+ * Rewrite the ".." entry to point to the new
+ * directory.
+ */
+ error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
+ target_dp->i_ino, spaceres);
+ ASSERT(error != -EEXIST);
+ if (error)
+ return error;
+ }
+
+ /*
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
+ */
+ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the link count on src_dp. This is necessary when
+ * renaming a directory, either within one parent when
+ * the target existed, or across two parent directories.
+ */
+ if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+ /*
+ * Decrement link count on src_directory since the
+ * entry that's moved no longer points to it.
+ */
+ error = xfs_droplink(tp, src_dp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * For whiteouts, we only need to update the source dirent with the
+ * inode number of the whiteout inode rather than removing it
+ * altogether.
+ */
+ if (du_wip->ip)
+ error = xfs_dir_replace(tp, src_dp, src_name, du_wip->ip->i_ino,
+ spaceres);
+ else
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ spaceres);
+ if (error)
+ return error;
+
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+
+ /* Schedule parent pointer updates. */
+ if (du_wip->ppargs) {
+ error = xfs_parent_addname(tp, du_wip->ppargs, src_dp,
+ src_name, du_wip->ip);
+ if (error)
+ return error;
+ }
+
+ if (du_src->ppargs) {
+ error = xfs_parent_replacename(tp, du_src->ppargs, src_dp,
+ src_name, target_dp, target_name, src_ip);
+ if (error)
+ return error;
+ }
+
+ if (du_tgt->ppargs) {
+ error = xfs_parent_removename(tp, du_tgt->ppargs, target_dp,
+ target_name, target_ip);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Inform our hook clients that we've finished a rename operation as
+ * follows: removed the source and target files from their directories;
+ * that we've added the source to the target directory; and finally
+ * that we've added the whiteout, if there was one. All inodes are
+ * locked, so it's ok to model a rename this way so long as we say we
+ * deleted entries before we add new ones.
+ */
+ if (target_ip)
+ xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+ xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+ xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+ if (du_wip->ip)
+ xfs_dir_update_hook(src_dp, du_wip->ip, 1, src_name);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 6dbe6e9ecb49..576068ed81fa 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -74,7 +74,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name);
+ const struct xfs_name *name);
int xfs_dir_lookup_args(struct xfs_da_args *args);
int xfs_dir_createname_args(struct xfs_da_args *args);
@@ -309,4 +309,51 @@ static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c)
return c;
}
+struct xfs_dir_update_params {
+ const struct xfs_inode *dp;
+ const struct xfs_inode *ip;
+ const struct xfs_name *name;
+ int delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+ int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+ struct xfs_hook dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
+struct xfs_parent_args;
+
+struct xfs_dir_update {
+ struct xfs_inode *dp;
+ const struct xfs_name *name;
+ struct xfs_inode *ip;
+ struct xfs_parent_args *ppargs;
+};
+
+int xfs_dir_create_child(struct xfs_trans *tp, unsigned int resblks,
+ struct xfs_dir_update *du);
+int xfs_dir_add_child(struct xfs_trans *tp, unsigned int resblks,
+ struct xfs_dir_update *du);
+int xfs_dir_remove_child(struct xfs_trans *tp, unsigned int resblks,
+ struct xfs_dir_update *du);
+
+int xfs_dir_exchange_children(struct xfs_trans *tp, struct xfs_dir_update *du1,
+ struct xfs_dir_update *du2, unsigned int spaceres);
+int xfs_dir_rename_children(struct xfs_trans *tp, struct xfs_dir_update *du_src,
+ struct xfs_dir_update *du_tgt, unsigned int spaceres,
+ struct xfs_dir_update *du_wip);
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index ea0b9628df18..a16b05c43e2e 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -178,6 +178,14 @@ __xfs_dir3_data_check(
while (offset < end) {
struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+ unsigned int reclen;
+
+ /*
+ * Are the remaining bytes large enough to hold an
+ * unused entry?
+ */
+ if (offset > end - xfs_dir2_data_unusedsize(1))
+ return __this_address;
/*
* If it's unused, look for the space in the bestfree table.
@@ -187,9 +195,13 @@ __xfs_dir3_data_check(
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
xfs_failaddr_t fa;
+ reclen = xfs_dir2_data_unusedsize(
+ be16_to_cpu(dup->length));
if (lastfree != 0)
return __this_address;
- if (offset + be16_to_cpu(dup->length) > end)
+ if (be16_to_cpu(dup->length) != reclen)
+ return __this_address;
+ if (offset + reclen > end)
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
offset)
@@ -207,10 +219,18 @@ __xfs_dir3_data_check(
be16_to_cpu(bf[2].length))
return __this_address;
}
- offset += be16_to_cpu(dup->length);
+ offset += reclen;
lastfree = 1;
continue;
}
+
+ /*
+ * This is not an unused entry. Are the remaining bytes
+ * large enough for a dirent with a single-byte name?
+ */
+ if (offset > end - xfs_dir2_data_entsize(mp, 1))
+ return __this_address;
+
/*
* It's a real entry. Validate the fields.
* If this is a block directory then make sure it's
@@ -219,9 +239,10 @@ __xfs_dir3_data_check(
*/
if (dep->namelen == 0)
return __this_address;
- if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
+ reclen = xfs_dir2_data_entsize(mp, dep->namelen);
+ if (offset + reclen > end)
return __this_address;
- if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
+ if (!xfs_verify_dir_ino(mp, be64_to_cpu(dep->inumber)))
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
return __this_address;
@@ -245,7 +266,7 @@ __xfs_dir3_data_check(
if (i >= be32_to_cpu(btp->count))
return __this_address;
}
- offset += xfs_dir2_data_entsize(mp, dep->namelen);
+ offset += reclen;
}
/*
* Need to have seen all the entries and all the bestfree slots.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 3befb32509fa..10041350274a 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -190,6 +190,13 @@ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
struct dir_context *ctx, size_t bufsize);
static inline unsigned int
+xfs_dir2_data_unusedsize(
+ unsigned int len)
+{
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
+static inline unsigned int
xfs_dir2_data_entsize(
struct xfs_mount *mp,
unsigned int namelen)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 61f51becff4f..e1bfee0c3b1a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -90,8 +90,7 @@ struct xfs_ifork;
#define XFSLABEL_MAX 12
/*
- * Superblock - in core version. Must match the ondisk version below.
- * Must be padded to 64 bit alignment.
+ * Superblock - in core version. Must be padded to 64 bit alignment.
*/
typedef struct xfs_sb {
uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
@@ -178,10 +177,8 @@ typedef struct xfs_sb {
/* must be padded to 64 bit alignment */
} xfs_sb_t;
-#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
-
/*
- * Superblock - on disk version. Must match the in core version above.
+ * Superblock - on disk version.
* Must be padded to 64 bit alignment.
*/
struct xfs_dsb {
@@ -265,6 +262,8 @@ struct xfs_dsb {
/* must be padded to 64 bit alignment */
};
+#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc)
+
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
* a feature bit is set when the flag is used.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 97996cb79aaa..454b63ef7201 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -996,7 +996,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
-#define XFS_IOC_EXCHANGE_RANGE _IOWR('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_EXCHANGE_RANGE _IOW ('X', 129, struct xfs_exchange_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 14c81f227c5b..0af5b7a33d05 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1946,6 +1946,21 @@ retry:
}
return -ENOSPC;
}
+
+ /*
+ * Protect against obviously corrupt allocation btree records. Later
+ * xfs_iget checks will catch re-allocation of other active in-memory
+ * and on-disk inodes. If we don't catch reallocating the parent inode
+ * here we will deadlock in xfs_iget() so we have to do these checks
+ * first.
+ */
+ if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
+ xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+ XFS_SICK_AG_INOBT);
+ return -EFSCORRUPTED;
+ }
+
*new_ino = ino;
return 0;
}
@@ -1975,7 +1990,7 @@ xfs_difree_inode_chunk(
return xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -2021,8 +2036,7 @@ xfs_difree_inode_chunk(
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
error = xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
- false);
+ &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 42e9fd47f6c7..496e2f72a85b 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -170,7 +170,7 @@ __xfs_inobt_free_block(
xfs_inobt_mod_blockcount(cur, -1);
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_INOBT, resv, false);
+ &XFS_RMAP_OINFO_INOBT, resv, 0);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index e7a7bfbe75b4..513b50da6215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -379,10 +379,13 @@ xfs_dinode_verify_fork(
/*
* A directory small enough to fit in the inode must be stored
* in local format. The directory sf <-> extents conversion
- * code updates the directory size accordingly.
+ * code updates the directory size accordingly. Directories
+ * being truncated have zero size and are not subject to this
+ * check.
*/
if (S_ISDIR(mode)) {
- if (be64_to_cpu(dip->di_size) <= fork_size &&
+ if (dip->di_size &&
+ be64_to_cpu(dip->di_size) <= fork_size &&
fork_format != XFS_DINODE_FMT_LOCAL)
return __this_address;
}
@@ -528,9 +531,19 @@ xfs_dinode_verify(
if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
return __this_address;
- /* No zero-length symlinks/dirs. */
- if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
- return __this_address;
+ /*
+ * No zero-length symlinks/dirs unless they're unlinked and hence being
+ * inactivated.
+ */
+ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+ if (dip->di_version > 1) {
+ if (dip->di_nlink)
+ return __this_address;
+ } else {
+ if (dip->di_onlink)
+ return __this_address;
+ }
+ }
fa = xfs_dinode_verify_nrext64(mp, dip);
if (fa)
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
new file mode 100644
index 000000000000..032333289113
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -0,0 +1,749 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include <linux/iversion.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_inode_util.h"
+#include "xfs_trans.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_ag.h"
+#include "xfs_iunlink_item.h"
+#include "xfs_inode_item.h"
+
+uint16_t
+xfs_flags2diflags(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ /* can't set PREALLOC this way, just preserve it */
+ uint16_t di_flags =
+ (ip->i_diflags & XFS_DIFLAG_PREALLOC);
+
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ di_flags |= XFS_DIFLAG_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+ di_flags |= XFS_DIFLAG_APPEND;
+ if (xflags & FS_XFLAG_SYNC)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if (xflags & FS_XFLAG_NOATIME)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if (xflags & FS_XFLAG_NODUMP)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if (xflags & FS_XFLAG_NODEFRAG)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (xflags & FS_XFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (xflags & FS_XFLAG_NOSYMLINKS)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if (xflags & FS_XFLAG_EXTSZINHERIT)
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ if (xflags & FS_XFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_REALTIME)
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (xflags & FS_XFLAG_EXTSIZE)
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ }
+
+ return di_flags;
+}
+
+uint64_t
+xfs_flags2diflags2(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ uint64_t di_flags2 =
+ (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
+
+ if (xflags & FS_XFLAG_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+ if (xflags & FS_XFLAG_COWEXTSIZE)
+ di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+
+ return di_flags2;
+}
+
+uint32_t
+xfs_ip2xflags(
+ struct xfs_inode *ip)
+{
+ uint32_t flags = 0;
+
+ if (ip->i_diflags & XFS_DIFLAG_ANY) {
+ if (ip->i_diflags & XFS_DIFLAG_REALTIME)
+ flags |= FS_XFLAG_REALTIME;
+ if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
+ flags |= FS_XFLAG_PREALLOC;
+ if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
+ flags |= FS_XFLAG_IMMUTABLE;
+ if (ip->i_diflags & XFS_DIFLAG_APPEND)
+ flags |= FS_XFLAG_APPEND;
+ if (ip->i_diflags & XFS_DIFLAG_SYNC)
+ flags |= FS_XFLAG_SYNC;
+ if (ip->i_diflags & XFS_DIFLAG_NOATIME)
+ flags |= FS_XFLAG_NOATIME;
+ if (ip->i_diflags & XFS_DIFLAG_NODUMP)
+ flags |= FS_XFLAG_NODUMP;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
+ flags |= FS_XFLAG_RTINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ flags |= FS_XFLAG_PROJINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
+ flags |= FS_XFLAG_NOSYMLINKS;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
+ flags |= FS_XFLAG_EXTSIZE;
+ if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
+ flags |= FS_XFLAG_EXTSZINHERIT;
+ if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
+ flags |= FS_XFLAG_NODEFRAG;
+ if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
+ flags |= FS_XFLAG_FILESTREAM;
+ }
+
+ if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
+ if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
+ flags |= FS_XFLAG_DAX;
+ if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
+ flags |= FS_XFLAG_COWEXTSIZE;
+ }
+
+ if (xfs_inode_has_attr_fork(ip))
+ flags |= FS_XFLAG_HASATTR;
+ return flags;
+}
+
+prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+ if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ return dp->i_projid;
+
+ /* Assign to the root project by default. */
+ return 0;
+}
+
+/* Propagate di_flags from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ unsigned int di_flags = 0;
+ xfs_failaddr_t failaddr;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_extsize = pip->i_extsize;
+ }
+ if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_has_realtime(ip->i_mount))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_extsize = pip->i_extsize;
+ }
+ }
+ if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_diflags |= di_flags;
+
+ /*
+ * Inode verifiers on older kernels only check that the extent size
+ * hint is an integer multiple of the rt extent size on realtime files.
+ * They did not check the hint alignment on a directory with both
+ * rtinherit and extszinherit flags set. If the misaligned hint is
+ * propagated from a directory into a new realtime file, new file
+ * allocations will fail due to math errors in the rt allocator and/or
+ * trip the verifiers. Validate the hint settings in the new file so
+ * that we don't let broken hints propagate.
+ */
+ failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+ VFS_I(ip)->i_mode, ip->i_diflags);
+ if (failaddr) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ }
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static inline void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ xfs_failaddr_t failaddr;
+
+ if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = pip->i_cowextsize;
+ }
+ if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
+ ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+ /* Don't let invalid cowextsize hints propagate. */
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+ VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+ if (failaddr) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ }
+}
+
+/*
+ * If we need to create attributes immediately after allocating the inode,
+ * initialise an empty attribute fork right now. We use the default fork offset
+ * for attributes here as we don't know exactly what size or how many
+ * attributes we might be adding. We can do this safely here because we know
+ * the data fork is completely empty and this saves us from needing to run a
+ * separate transaction to set the fork offset in the immediate future.
+ *
+ * If we have parent pointers and the caller hasn't told us that the file will
+ * never be linked into a directory tree, we /must/ create the attr fork.
+ */
+static inline bool
+xfs_icreate_want_attrfork(
+ struct xfs_mount *mp,
+ const struct xfs_icreate_args *args)
+{
+ if (args->flags & XFS_ICREATE_INIT_XATTRS)
+ return true;
+
+ if (!(args->flags & XFS_ICREATE_UNLINKABLE) && xfs_has_parent(mp))
+ return true;
+
+ return false;
+}
+
+/* Initialise an inode's attributes. */
+void
+xfs_inode_init(
+ struct xfs_trans *tp,
+ const struct xfs_icreate_args *args,
+ struct xfs_inode *ip)
+{
+ struct xfs_inode *pip = args->pip;
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct inode *inode = VFS_I(ip);
+ unsigned int flags;
+ int times = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG |
+ XFS_ICHGTIME_ACCESS;
+
+ if (args->flags & XFS_ICREATE_TMPFILE)
+ set_nlink(inode, 0);
+ else if (S_ISDIR(args->mode))
+ set_nlink(inode, 2);
+ else
+ set_nlink(inode, 1);
+ inode->i_rdev = args->rdev;
+
+ if (!args->idmap || pip == NULL) {
+ /* creating a tree root, sb rooted, or detached file */
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ ip->i_projid = 0;
+ inode->i_mode = args->mode;
+ } else {
+ /* creating a child in the directory tree */
+ if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
+ inode_fsuid_set(inode, args->idmap);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = args->mode;
+ } else {
+ inode_init_owner(args->idmap, inode, dir, args->mode);
+ }
+
+ /*
+ * If the group ID of the new file does not match the effective
+ * group ID or one of the supplementary group IDs, the S_ISGID
+ * bit is cleared (and only if the irix_sgid_inherit
+ * compatibility variable is set).
+ */
+ if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
+ inode->i_mode &= ~S_ISGID;
+
+ ip->i_projid = pip ? xfs_get_initial_prid(pip) : 0;
+ }
+
+ ip->i_disk_size = 0;
+ ip->i_df.if_nextents = 0;
+ ASSERT(ip->i_nblocks == 0);
+
+ ip->i_extsize = 0;
+ ip->i_diflags = 0;
+
+ if (xfs_has_v3inodes(mp)) {
+ inode_set_iversion(inode, 1);
+ ip->i_cowextsize = 0;
+ times |= XFS_ICHGTIME_CREATE;
+ }
+
+ xfs_trans_ichgtime(tp, ip, times);
+
+ flags = XFS_ILOG_CORE;
+ switch (args->mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
+ flags |= XFS_ILOG_DEV;
+ break;
+ case S_IFREG:
+ case S_IFDIR:
+ if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
+ fallthrough;
+ case S_IFLNK:
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_bytes = 0;
+ ip->i_df.if_data = NULL;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (xfs_icreate_want_attrfork(mp, args)) {
+ ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
+ xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
+
+ if (!xfs_has_attr(mp)) {
+ spin_lock(&mp->m_sb_lock);
+ xfs_add_attr(mp);
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+ }
+ }
+
+ xfs_trans_log_inode(tp, ip, flags);
+}
+
+/*
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * Hence we keep an in-memory double linked list to link each inode on an
+ * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
+ * based lists would require having 64 list heads in the perag, one for each
+ * list. This is expensive in terms of memory (think millions of AGs) and cache
+ * misses on lookups. Instead, use the fact that inodes on the unlinked list
+ * must be referenced at the VFS level to keep them on the list and hence we
+ * have an existence guarantee for inodes on the unlinked list.
+ *
+ * Given we have an existence guarantee, we can use lockless inode cache lookups
+ * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
+ * for the double linked unlinked list, and we don't need any extra locking to
+ * keep the list safe as all manipulations are done under the AGI buffer lock.
+ * Keeping the list up to date does not require memory allocation, just finding
+ * the XFS inode and updating the next/prev unlinked list aginos.
+ */
+
+/*
+ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
+ * is not in cache.
+ */
+static int
+xfs_iunlink_update_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_inode *ip;
+
+ /* No update necessary if we are at the end of the list. */
+ if (next_agino == NULLAGINO)
+ return 0;
+
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+ return -ENOLINK;
+
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(pag, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino) {
+ xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ return -EFSCORRUPTED;
+ }
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+static int
+xfs_iunlink_insert_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t next_agino;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
+ */
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(pag, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Update the prev pointer in the next inode to point back to this
+ * inode.
+ */
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
+ if (error)
+ return error;
+
+ if (next_agino != NULLAGINO) {
+ /*
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
+ if (error)
+ return error;
+ ip->i_next_unlinked = next_agino;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ ip->i_prev_unlinked = NULLAGINO;
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
+ */
+int
+xfs_iunlink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_buf *agibp;
+ int error;
+
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, 0, &agibp);
+ if (error)
+ goto out;
+
+ error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static int
+xfs_iunlink_remove_inode(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_buf *agibp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = agibp->b_addr;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /*
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
+ */
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(pag, head_agino)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ agi, sizeof(*agi));
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
+ if (error)
+ return error;
+
+ /*
+ * Update the prev pointer in the next inode to point back to previous
+ * inode in the chain.
+ */
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
+ if (error)
+ return error;
+
+ if (head_agino != agino) {
+ struct xfs_inode *prev_ip;
+
+ prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
+ if (!prev_ip) {
+ xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_iunlink_log_inode(tp, prev_ip, pag,
+ ip->i_next_unlinked);
+ prev_ip->i_next_unlinked = ip->i_next_unlinked;
+ } else {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
+ ip->i_next_unlinked);
+ }
+
+ ip->i_next_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
+ return error;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+int
+xfs_iunlink_remove(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ struct xfs_buf *agibp;
+ int error;
+
+ trace_xfs_iunlink_remove(ip);
+
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(pag, tp, 0, &agibp);
+ if (error)
+ return error;
+
+ return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
+}
+
+/*
+ * Decrement the link count on an inode & log the change. If this causes the
+ * link count to go to zero, move the inode to AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
+ */
+int
+xfs_droplink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ if (inode->i_nlink == 0) {
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count dropped below zero. Pinning link count.",
+ ip->i_ino);
+ set_nlink(inode, XFS_NLINK_PINNED);
+ }
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ drop_nlink(inode);
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (inode->i_nlink)
+ return 0;
+
+ return xfs_iunlink(tp, ip);
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+void
+xfs_bumplink(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+
+ if (inode->i_nlink == XFS_NLINK_PINNED - 1)
+ xfs_info_ratelimited(tp->t_mountp,
+ "Inode 0x%llx link count exceeded maximum. Pinning link count.",
+ ip->i_ino);
+ if (inode->i_nlink != XFS_NLINK_PINNED)
+ inc_nlink(inode);
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free an inode in the ondisk index and zero it out. */
+int
+xfs_inode_uninit(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ struct xfs_icluster *xic)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
+ */
+ error = xfs_difree(tp, pag, ip->i_ino, xic);
+ if (error)
+ return error;
+
+ error = xfs_iunlink_remove(tp, pag, ip);
+ if (error)
+ return error;
+
+ /*
+ * Free any local-format data sitting around before we reset the
+ * data fork to extents format. Note that the attr fork data has
+ * already been freed by xfs_attr_inactive.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ kfree(ip->i_df.if_data);
+ ip->i_df.if_data = NULL;
+ ip->i_df.if_bytes = 0;
+ }
+
+ VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
+ ip->i_diflags = 0;
+ ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
+ ip->i_forkoff = 0; /* mark the attr fork not in use */
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
+
+ /*
+ * Bump the generation count so no one will be confused
+ * by reincarnations of this inode.
+ */
+ VFS_I(ip)->i_generation++;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_util.h b/fs/xfs/libxfs/xfs_inode_util.h
new file mode 100644
index 000000000000..060242998a23
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_inode_util.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_INODE_UTIL_H__
+#define __XFS_INODE_UTIL_H__
+
+struct xfs_icluster;
+
+uint16_t xfs_flags2diflags(struct xfs_inode *ip, unsigned int xflags);
+uint64_t xfs_flags2diflags2(struct xfs_inode *ip, unsigned int xflags);
+uint32_t xfs_dic2xflags(struct xfs_inode *ip);
+uint32_t xfs_ip2xflags(struct xfs_inode *ip);
+
+prid_t xfs_get_initial_prid(struct xfs_inode *dp);
+
+/*
+ * File creation context.
+ *
+ * Due to our only partial reliance on the VFS to propagate uid and gid values
+ * according to accepted Unix behaviors, callers must initialize idmap to the
+ * correct idmapping structure to get the correct inheritance behaviors when
+ * XFS_MOUNT_GRPID is set.
+ *
+ * To create files detached from the directory tree (e.g. quota inodes), set
+ * idmap to NULL. To create a tree root, set pip to NULL.
+ */
+struct xfs_icreate_args {
+ struct mnt_idmap *idmap;
+ struct xfs_inode *pip; /* parent inode or null */
+ dev_t rdev;
+ umode_t mode;
+
+#define XFS_ICREATE_TMPFILE (1U << 0) /* create an unlinked file */
+#define XFS_ICREATE_INIT_XATTRS (1U << 1) /* will set xattrs immediately */
+#define XFS_ICREATE_UNLINKABLE (1U << 2) /* cannot link into dir tree */
+ uint16_t flags;
+};
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
+#define XFS_ICHGTIME_ACCESS 0x8 /* last access timestamp */
+void xfs_trans_ichgtime(struct xfs_trans *tp, struct xfs_inode *ip, int flags);
+
+void xfs_inode_init(struct xfs_trans *tp, const struct xfs_icreate_args *args,
+ struct xfs_inode *ip);
+
+int xfs_inode_uninit(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *ip, struct xfs_icluster *xic);
+
+int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
+ struct xfs_inode *ip);
+int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
+void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
+
+#endif /* __XFS_INODE_UTIL_H__ */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index e8cdd77d03fa..23c133fd36f5 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -85,6 +85,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
*/
+ XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 511c912d515c..198b84117df1 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -24,6 +24,7 @@
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_refcount_item.h"
struct kmem_cache *xfs_refcount_intent_cache;
@@ -51,7 +52,7 @@ xfs_refcount_lookup_le(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ trace_xfs_refcount_lookup(cur,
xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
@@ -71,7 +72,7 @@ xfs_refcount_lookup_ge(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ trace_xfs_refcount_lookup(cur,
xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
@@ -91,7 +92,7 @@ xfs_refcount_lookup_eq(
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ trace_xfs_refcount_lookup(cur,
xfs_refcount_encode_startblock(bno, domain),
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
@@ -183,7 +184,7 @@ xfs_refcount_get_rec(
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ trace_xfs_refcount_get(cur, irec);
return 0;
}
@@ -201,7 +202,7 @@ xfs_refcount_update(
uint32_t start;
int error;
- trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ trace_xfs_refcount_update(cur, irec);
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
@@ -211,8 +212,7 @@ xfs_refcount_update(
error = xfs_btree_update(cur, &rec);
if (error)
- trace_xfs_refcount_update_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_update_error(cur, error, _RET_IP_);
return error;
}
@@ -229,7 +229,7 @@ xfs_refcount_insert(
{
int error;
- trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec);
+ trace_xfs_refcount_insert(cur, irec);
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
@@ -247,8 +247,7 @@ xfs_refcount_insert(
out_error:
if (error)
- trace_xfs_refcount_insert_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_insert_error(cur, error, _RET_IP_);
return error;
}
@@ -275,7 +274,7 @@ xfs_refcount_delete(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
+ trace_xfs_refcount_delete(cur, &irec);
error = xfs_btree_delete(cur, i);
if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
xfs_btree_mark_sick(cur);
@@ -288,8 +287,7 @@ xfs_refcount_delete(
&found_rec);
out_error:
if (error)
- trace_xfs_refcount_delete_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_delete_error(cur, error, _RET_IP_);
return error;
}
@@ -413,8 +411,7 @@ xfs_refcount_split_extent(
return 0;
*shape_changed = true;
- trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- &rcext, agbno);
+ trace_xfs_refcount_split_extent(cur, &rcext, agbno);
/* Establish the right extent. */
tmp = rcext;
@@ -438,8 +435,7 @@ xfs_refcount_split_extent(
return error;
out_error:
- trace_xfs_refcount_split_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_split_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -458,8 +454,7 @@ xfs_refcount_merge_center_extents(
int error;
int found_rec;
- trace_xfs_refcount_merge_center_extents(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, left, center, right);
+ trace_xfs_refcount_merge_center_extents(cur, left, center, right);
ASSERT(left->rc_domain == center->rc_domain);
ASSERT(right->rc_domain == center->rc_domain);
@@ -522,8 +517,7 @@ xfs_refcount_merge_center_extents(
return error;
out_error:
- trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_merge_center_extents_error(cur, error, _RET_IP_);
return error;
}
@@ -541,8 +535,7 @@ xfs_refcount_merge_left_extent(
int error;
int found_rec;
- trace_xfs_refcount_merge_left_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, left, cleft);
+ trace_xfs_refcount_merge_left_extent(cur, left, cleft);
ASSERT(left->rc_domain == cleft->rc_domain);
@@ -589,8 +582,7 @@ xfs_refcount_merge_left_extent(
return error;
out_error:
- trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_merge_left_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -607,8 +599,7 @@ xfs_refcount_merge_right_extent(
int error;
int found_rec;
- trace_xfs_refcount_merge_right_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, cright, right);
+ trace_xfs_refcount_merge_right_extent(cur, cright, right);
ASSERT(right->rc_domain == cright->rc_domain);
@@ -658,8 +649,7 @@ xfs_refcount_merge_right_extent(
return error;
out_error:
- trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_merge_right_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -748,13 +738,11 @@ not_found:
cleft->rc_refcount = 1;
cleft->rc_domain = domain;
}
- trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- left, cleft, agbno);
+ trace_xfs_refcount_find_left_extent(cur, left, cleft, agbno);
return error;
out_error:
- trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_find_left_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -843,13 +831,12 @@ not_found:
cright->rc_refcount = 1;
cright->rc_domain = domain;
}
- trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- cright, right, agbno + aglen);
+ trace_xfs_refcount_find_right_extent(cur, cright, right,
+ agbno + aglen);
return error;
out_error:
- trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_find_right_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -1148,8 +1135,7 @@ xfs_refcount_adjust_extents(
tmp.rc_refcount = 1 + adj;
tmp.rc_domain = XFS_REFC_DOMAIN_SHARED;
- trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, &tmp);
+ trace_xfs_refcount_modify_extent(cur, &tmp);
/*
* Either cover the hole (increment) or
@@ -1173,7 +1159,7 @@ xfs_refcount_adjust_extents(
tmp.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
goto out_error;
}
@@ -1214,8 +1200,7 @@ xfs_refcount_adjust_extents(
if (ext.rc_refcount == MAXREFCOUNT)
goto skip;
ext.rc_refcount += adj;
- trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, &ext);
+ trace_xfs_refcount_modify_extent(cur, &ext);
cur->bc_refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
@@ -1237,7 +1222,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
goto out_error;
}
@@ -1254,8 +1239,7 @@ advloop:
return error;
out_error:
- trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -1272,11 +1256,9 @@ xfs_refcount_adjust(
int error;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+ trace_xfs_refcount_increase(cur, *agbno, *aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, *agbno, *aglen);
+ trace_xfs_refcount_decrease(cur, *agbno, *aglen);
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
@@ -1315,28 +1297,10 @@ xfs_refcount_adjust(
return 0;
out_error:
- trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_refcount_adjust_error(cur, error, _RET_IP_);
return error;
}
-/* Clean up after calling xfs_refcount_finish_one. */
-void
-xfs_refcount_finish_one_cleanup(
- struct xfs_trans *tp,
- struct xfs_btree_cur *rcur,
- int error)
-{
- struct xfs_buf *agbp;
-
- if (rcur == NULL)
- return;
- agbp = rcur->bc_ag.agbp;
- xfs_btree_del_cursor(rcur, error);
- if (error)
- xfs_trans_brelse(tp, agbp);
-}
-
/*
* Set up a continuation a deferred refcount operation by updating the intent.
* Checks to make sure we're not going to run off the end of the AG.
@@ -1378,7 +1342,7 @@ xfs_refcount_finish_one(
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *rcur;
+ struct xfs_btree_cur *rcur = *pcur;
struct xfs_buf *agbp = NULL;
int error = 0;
xfs_agblock_t bno;
@@ -1387,9 +1351,7 @@ xfs_refcount_finish_one(
bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
- ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
- ri->ri_blockcount);
+ trace_xfs_refcount_deferred(mp, ri);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
@@ -1398,11 +1360,10 @@ xfs_refcount_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- rcur = *pcur;
if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
nr_ops = rcur->bc_refc.nr_ops;
shape_changes = rcur->bc_refc.shape_changes;
- xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+ xfs_btree_del_cursor(rcur, 0);
rcur = NULL;
*pcur = NULL;
}
@@ -1412,11 +1373,11 @@ xfs_refcount_finish_one(
if (error)
return error;
- rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+ *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
+ ri->ri_pag);
rcur->bc_refc.nr_ops = nr_ops;
rcur->bc_refc.shape_changes = shape_changes;
}
- *pcur = rcur;
switch (ri->ri_type) {
case XFS_REFCOUNT_INCREASE:
@@ -1452,8 +1413,7 @@ xfs_refcount_finish_one(
return -EFSCORRUPTED;
}
if (!error && ri->ri_blockcount > 0)
- trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno,
- ri->ri_type, bno, ri->ri_blockcount);
+ trace_xfs_refcount_finish_one_leftover(mp, ri);
return error;
}
@@ -1469,11 +1429,6 @@ __xfs_refcount_add(
{
struct xfs_refcount_intent *ri;
- trace_xfs_refcount_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, startblock),
- type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- blockcount);
-
ri = kmem_cache_alloc(xfs_refcount_intent_cache,
GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
@@ -1481,8 +1436,7 @@ __xfs_refcount_add(
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
- xfs_refcount_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
+ xfs_refcount_defer_add(tp, ri);
}
/*
@@ -1537,8 +1491,7 @@ xfs_refcount_find_shared(
int have;
int error;
- trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_find_shared(cur, agbno, aglen);
/* By default, skip the whole range */
*fbno = NULLAGBLOCK;
@@ -1625,13 +1578,11 @@ xfs_refcount_find_shared(
}
done:
- trace_xfs_refcount_find_shared_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, *fbno, *flen);
+ trace_xfs_refcount_find_shared_result(cur, *fbno, *flen);
out_error:
if (error)
- trace_xfs_refcount_find_shared_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_find_shared_error(cur, error, _RET_IP_);
return error;
}
@@ -1737,8 +1688,7 @@ xfs_refcount_adjust_cow_extents(
tmp.rc_refcount = 1;
tmp.rc_domain = XFS_REFC_DOMAIN_COW;
- trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, &tmp);
+ trace_xfs_refcount_modify_extent(cur, &tmp);
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -1769,8 +1719,7 @@ xfs_refcount_adjust_cow_extents(
}
ext.rc_refcount = 0;
- trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, &ext);
+ trace_xfs_refcount_modify_extent(cur, &ext);
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
@@ -1786,8 +1735,7 @@ xfs_refcount_adjust_cow_extents(
return error;
out_error:
- trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_refcount_modify_extent_error(cur, error, _RET_IP_);
return error;
}
@@ -1833,8 +1781,7 @@ xfs_refcount_adjust_cow(
return 0;
out_error:
- trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_refcount_adjust_cow_error(cur, error, _RET_IP_);
return error;
}
@@ -1847,8 +1794,7 @@ __xfs_refcount_cow_alloc(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_cow_increase(rcur, agbno, aglen);
/* Add refcount btree reservation */
return xfs_refcount_adjust_cow(rcur, agbno, aglen,
@@ -1864,8 +1810,7 @@ __xfs_refcount_cow_free(
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_cow_decrease(rcur, agbno, aglen);
/* Remove refcount btree reservation */
return xfs_refcount_adjust_cow(rcur, agbno, aglen,
@@ -2010,9 +1955,6 @@ xfs_refcount_recover_cow_leftovers(
if (error)
goto out_free;
- trace_xfs_refcount_recover_extent(mp, pag->pag_agno,
- &rr->rr_rrec);
-
/* Free the orphan record */
fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
rr->rr_rrec.rc_startblock);
@@ -2022,7 +1964,7 @@ xfs_refcount_recover_cow_leftovers(
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
goto out_trans;
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9b56768a590c..68acb0b1b4a8 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -48,6 +48,12 @@ enum xfs_refcount_intent_type {
XFS_REFCOUNT_FREE_COW,
};
+#define XFS_REFCOUNT_INTENT_STRINGS \
+ { XFS_REFCOUNT_INCREASE, "incr" }, \
+ { XFS_REFCOUNT_DECREASE, "decr" }, \
+ { XFS_REFCOUNT_ALLOC_COW, "alloc_cow" }, \
+ { XFS_REFCOUNT_FREE_COW, "free_cow" }
+
struct xfs_refcount_intent {
struct list_head ri_list;
struct xfs_perag *ri_pag;
@@ -68,16 +74,11 @@ xfs_refcount_check_domain(
return true;
}
-void xfs_refcount_update_get_group(struct xfs_mount *mp,
- struct xfs_refcount_intent *ri);
-
void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
-extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
- struct xfs_btree_cur *rcur, int error);
extern int xfs_refcount_finish_one(struct xfs_trans *tp,
struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index ca59f6c89f3e..cb3b1d42ae9a 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -109,7 +109,7 @@ xfs_refcountbt_free_block(
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, 0);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index ef16f6f9cef6..6ef4687b3aba 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -24,6 +24,7 @@
#include "xfs_inode.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_rmap_item.h"
struct kmem_cache *xfs_rmap_intent_cache;
@@ -100,8 +101,7 @@ xfs_rmap_update(
union xfs_btree_rec rec;
int error;
- trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- irec->rm_startblock, irec->rm_blockcount,
+ trace_xfs_rmap_update(cur, irec->rm_startblock, irec->rm_blockcount,
irec->rm_owner, irec->rm_offset, irec->rm_flags);
rec.rmap.rm_startblock = cpu_to_be32(irec->rm_startblock);
@@ -111,8 +111,7 @@ xfs_rmap_update(
xfs_rmap_irec_offset_pack(irec));
error = xfs_btree_update(cur, &rec);
if (error)
- trace_xfs_rmap_update_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_update_error(cur, error, _RET_IP_);
return error;
}
@@ -128,8 +127,7 @@ xfs_rmap_insert(
int i;
int error;
- trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
- len, owner, offset, flags);
+ trace_xfs_rmap_insert(rcur, agbno, len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
@@ -155,8 +153,7 @@ xfs_rmap_insert(
}
done:
if (error)
- trace_xfs_rmap_insert_error(rcur->bc_mp,
- rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_insert_error(rcur, error, _RET_IP_);
return error;
}
@@ -172,8 +169,7 @@ xfs_rmap_delete(
int i;
int error;
- trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno,
- len, owner, offset, flags);
+ trace_xfs_rmap_delete(rcur, agbno, len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
@@ -194,8 +190,7 @@ xfs_rmap_delete(
}
done:
if (error)
- trace_xfs_rmap_delete_error(rcur->bc_mp,
- rcur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_delete_error(rcur, error, _RET_IP_);
return error;
}
@@ -342,8 +337,7 @@ xfs_rmap_find_left_neighbor_helper(
{
struct xfs_find_left_neighbor_info *info = priv;
- trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ trace_xfs_rmap_find_left_neighbor_candidate(cur, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -393,8 +387,8 @@ xfs_rmap_find_left_neighbor(
info.high.rm_blockcount = 0;
info.irec = irec;
- trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
+ trace_xfs_rmap_find_left_neighbor_query(cur, bno, 0, owner, offset,
+ flags);
/*
* Historically, we always used the range query to walk every reverse
@@ -425,8 +419,7 @@ xfs_rmap_find_left_neighbor(
return error;
*stat = 1;
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ trace_xfs_rmap_find_left_neighbor_result(cur, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
irec->rm_flags);
return 0;
@@ -441,8 +434,7 @@ xfs_rmap_lookup_le_range_helper(
{
struct xfs_find_left_neighbor_info *info = priv;
- trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, rec->rm_startblock,
+ trace_xfs_rmap_lookup_le_range_candidate(cur, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -489,8 +481,7 @@ xfs_rmap_lookup_le_range(
*stat = 0;
info.irec = irec;
- trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- bno, 0, owner, offset, flags);
+ trace_xfs_rmap_lookup_le_range(cur, bno, 0, owner, offset, flags);
/*
* Historically, we always used the range query to walk every reverse
@@ -521,8 +512,7 @@ xfs_rmap_lookup_le_range(
return error;
*stat = 1;
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ trace_xfs_rmap_lookup_le_range_result(cur, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
irec->rm_flags);
return 0;
@@ -634,8 +624,7 @@ xfs_rmap_unmap(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
/*
* We should always have a left record because there's a static record
@@ -651,10 +640,9 @@ xfs_rmap_unmap(
goto out_error;
}
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
- ltrec.rm_blockcount, ltrec.rm_owner,
- ltrec.rm_offset, ltrec.rm_flags);
+ trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset,
+ ltrec.rm_flags);
ltoff = ltrec.rm_offset;
/*
@@ -721,10 +709,9 @@ xfs_rmap_unmap(
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- ltrec.rm_startblock, ltrec.rm_blockcount,
- ltrec.rm_owner, ltrec.rm_offset,
- ltrec.rm_flags);
+ trace_xfs_rmap_delete(cur, ltrec.rm_startblock,
+ ltrec.rm_blockcount, ltrec.rm_owner,
+ ltrec.rm_offset, ltrec.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
@@ -800,8 +787,7 @@ xfs_rmap_unmap(
else
cur->bc_rec.r.rm_offset = offset + len;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
- cur->bc_rec.r.rm_startblock,
+ trace_xfs_rmap_insert(cur, cur->bc_rec.r.rm_startblock,
cur->bc_rec.r.rm_blockcount,
cur->bc_rec.r.rm_owner,
cur->bc_rec.r.rm_offset,
@@ -812,12 +798,10 @@ xfs_rmap_unmap(
}
out_done:
- trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
return error;
}
@@ -987,8 +971,7 @@ xfs_rmap_map(
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
/*
@@ -1001,8 +984,7 @@ xfs_rmap_map(
if (error)
goto out_error;
if (have_lt) {
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
+ trace_xfs_rmap_lookup_le_range_result(cur, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
@@ -1040,10 +1022,10 @@ xfs_rmap_map(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
- gtrec.rm_blockcount, gtrec.rm_owner,
- gtrec.rm_offset, gtrec.rm_flags);
+ trace_xfs_rmap_find_right_neighbor_result(cur,
+ gtrec.rm_startblock, gtrec.rm_blockcount,
+ gtrec.rm_owner, gtrec.rm_offset,
+ gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
have_gt = 0;
}
@@ -1080,12 +1062,9 @@ xfs_rmap_map(
* result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
*/
ltrec.rm_blockcount += gtrec.rm_blockcount;
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- gtrec.rm_startblock,
- gtrec.rm_blockcount,
- gtrec.rm_owner,
- gtrec.rm_offset,
- gtrec.rm_flags);
+ trace_xfs_rmap_delete(cur, gtrec.rm_startblock,
+ gtrec.rm_blockcount, gtrec.rm_owner,
+ gtrec.rm_offset, gtrec.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
@@ -1132,8 +1111,7 @@ xfs_rmap_map(
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
- owner, offset, flags);
+ trace_xfs_rmap_insert(cur, bno, len, owner, offset, flags);
error = xfs_btree_insert(cur, &i);
if (error)
goto out_error;
@@ -1144,12 +1122,10 @@ xfs_rmap_map(
}
}
- trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_rmap_map_error(cur, error, _RET_IP_);
return error;
}
@@ -1223,8 +1199,7 @@ xfs_rmap_convert(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
/*
* For the initial lookup, look for an exact match or the left-adjacent
@@ -1240,10 +1215,9 @@ xfs_rmap_convert(
goto done;
}
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
- PREV.rm_blockcount, PREV.rm_owner,
- PREV.rm_offset, PREV.rm_flags);
+ trace_xfs_rmap_lookup_le_range_result(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset,
+ PREV.rm_flags);
ASSERT(PREV.rm_offset <= offset);
ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
@@ -1284,10 +1258,9 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, LEFT.rm_startblock,
- LEFT.rm_blockcount, LEFT.rm_owner,
- LEFT.rm_offset, LEFT.rm_flags);
+ trace_xfs_rmap_find_left_neighbor_result(cur,
+ LEFT.rm_startblock, LEFT.rm_blockcount,
+ LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags);
if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
LEFT.rm_offset + LEFT.rm_blockcount == offset &&
xfs_rmap_is_mergeable(&LEFT, owner, newext))
@@ -1325,10 +1298,10 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
- RIGHT.rm_blockcount, RIGHT.rm_owner,
- RIGHT.rm_offset, RIGHT.rm_flags);
+ trace_xfs_rmap_find_right_neighbor_result(cur,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
if (bno + len == RIGHT.rm_startblock &&
offset + len == RIGHT.rm_offset &&
xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1344,8 +1317,7 @@ xfs_rmap_convert(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
- _RET_IP_);
+ trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
/* reset the cursor back to PREV */
error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
@@ -1376,10 +1348,9 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- RIGHT.rm_startblock, RIGHT.rm_blockcount,
- RIGHT.rm_owner, RIGHT.rm_offset,
- RIGHT.rm_flags);
+ trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
@@ -1396,10 +1367,9 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- PREV.rm_startblock, PREV.rm_blockcount,
- PREV.rm_owner, PREV.rm_offset,
- PREV.rm_flags);
+ trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
@@ -1428,10 +1398,9 @@ xfs_rmap_convert(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- PREV.rm_startblock, PREV.rm_blockcount,
- PREV.rm_owner, PREV.rm_offset,
- PREV.rm_flags);
+ trace_xfs_rmap_delete(cur, PREV.rm_startblock,
+ PREV.rm_blockcount, PREV.rm_owner,
+ PREV.rm_offset, PREV.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
@@ -1468,10 +1437,9 @@ xfs_rmap_convert(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno,
- RIGHT.rm_startblock, RIGHT.rm_blockcount,
- RIGHT.rm_owner, RIGHT.rm_offset,
- RIGHT.rm_flags);
+ trace_xfs_rmap_delete(cur, RIGHT.rm_startblock,
+ RIGHT.rm_blockcount, RIGHT.rm_owner,
+ RIGHT.rm_offset, RIGHT.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
@@ -1549,8 +1517,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
- len, owner, offset, newext);
+ trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
@@ -1608,8 +1575,7 @@ xfs_rmap_convert(
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno,
- len, owner, offset, newext);
+ trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
@@ -1640,9 +1606,8 @@ xfs_rmap_convert(
NEW = PREV;
NEW.rm_blockcount = offset - PREV.rm_offset;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno,
- NEW.rm_startblock, NEW.rm_blockcount,
- NEW.rm_owner, NEW.rm_offset,
+ trace_xfs_rmap_insert(cur, NEW.rm_startblock,
+ NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
NEW.rm_flags);
error = xfs_btree_insert(cur, &i);
if (error)
@@ -1669,8 +1634,7 @@ xfs_rmap_convert(
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
- trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len,
- owner, offset, newext);
+ trace_xfs_rmap_insert(cur, bno, len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
@@ -1694,12 +1658,10 @@ xfs_rmap_convert(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
done:
if (error)
- trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
return error;
}
@@ -1735,8 +1697,7 @@ xfs_rmap_convert_shared(
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_convert(cur, bno, len, unwritten, oinfo);
/*
* For the initial lookup, look for and exact match or the left-adjacent
@@ -1805,10 +1766,10 @@ xfs_rmap_convert_shared(
error = -EFSCORRUPTED;
goto done;
}
- trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock,
- RIGHT.rm_blockcount, RIGHT.rm_owner,
- RIGHT.rm_offset, RIGHT.rm_flags);
+ trace_xfs_rmap_find_right_neighbor_result(cur,
+ RIGHT.rm_startblock, RIGHT.rm_blockcount,
+ RIGHT.rm_owner, RIGHT.rm_offset,
+ RIGHT.rm_flags);
if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
state |= RMAP_RIGHT_CONTIG;
}
@@ -1822,8 +1783,7 @@ xfs_rmap_convert_shared(
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state,
- _RET_IP_);
+ trace_xfs_rmap_convert_state(cur, state, _RET_IP_);
/*
* Switch out based on the FILLING and CONTIG state bits.
*/
@@ -2121,12 +2081,10 @@ xfs_rmap_convert_shared(
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_convert_done(cur, bno, len, unwritten, oinfo);
done:
if (error)
- trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_convert_error(cur, error, _RET_IP_);
return error;
}
@@ -2164,8 +2122,7 @@ xfs_rmap_unmap_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_unmap(cur, bno, len, unwritten, oinfo);
/*
* We should always have a left record because there's a static record
@@ -2321,12 +2278,10 @@ xfs_rmap_unmap_shared(
goto out_error;
}
- trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_unmap_done(cur, bno, len, unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_unmap_error(cur, error, _RET_IP_);
return error;
}
@@ -2361,8 +2316,7 @@ xfs_rmap_map_shared(
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_map(cur, bno, len, unwritten, oinfo);
/* Is there a left record that abuts our range? */
error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
@@ -2387,10 +2341,10 @@ xfs_rmap_map_shared(
error = -EFSCORRUPTED;
goto out_error;
}
- trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, gtrec.rm_startblock,
- gtrec.rm_blockcount, gtrec.rm_owner,
- gtrec.rm_offset, gtrec.rm_flags);
+ trace_xfs_rmap_find_right_neighbor_result(cur,
+ gtrec.rm_startblock, gtrec.rm_blockcount,
+ gtrec.rm_owner, gtrec.rm_offset,
+ gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
have_gt = 0;
@@ -2482,12 +2436,10 @@ xfs_rmap_map_shared(
goto out_error;
}
- trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len,
- unwritten, oinfo);
+ trace_xfs_rmap_map_done(cur, bno, len, unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, error, _RET_IP_);
+ trace_xfs_rmap_map_error(cur, error, _RET_IP_);
return error;
}
@@ -2572,23 +2524,6 @@ xfs_rmap_query_all(
return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
}
-/* Clean up after calling xfs_rmap_finish_one. */
-void
-xfs_rmap_finish_one_cleanup(
- struct xfs_trans *tp,
- struct xfs_btree_cur *rcur,
- int error)
-{
- struct xfs_buf *agbp;
-
- if (rcur == NULL)
- return;
- agbp = rcur->bc_ag.agbp;
- xfs_btree_del_cursor(rcur, error);
- if (error)
- xfs_trans_brelse(tp, agbp);
-}
-
/* Commit an rmap operation into the ondisk tree. */
int
__xfs_rmap_finish_intent(
@@ -2634,20 +2569,15 @@ xfs_rmap_finish_one(
struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur)
{
+ struct xfs_owner_info oinfo;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *rcur;
+ struct xfs_btree_cur *rcur = *pcur;
struct xfs_buf *agbp = NULL;
- int error = 0;
- struct xfs_owner_info oinfo;
xfs_agblock_t bno;
bool unwritten;
+ int error = 0;
- bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
-
- trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno,
- ri->ri_owner, ri->ri_whichfork,
- ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
- ri->ri_bmap.br_state);
+ trace_xfs_rmap_deferred(mp, ri);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
@@ -2656,9 +2586,8 @@ xfs_rmap_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- rcur = *pcur;
if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
- xfs_rmap_finish_one_cleanup(tp, rcur, 0);
+ xfs_btree_del_cursor(rcur, 0);
rcur = NULL;
*pcur = NULL;
}
@@ -2678,9 +2607,8 @@ xfs_rmap_finish_one(
return -EFSCORRUPTED;
}
- rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+ *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
}
- *pcur = rcur;
xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
ri->ri_bmap.br_startoff);
@@ -2722,15 +2650,6 @@ __xfs_rmap_add(
{
struct xfs_rmap_intent *ri;
- trace_xfs_rmap_defer(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
- type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
- owner, whichfork,
- bmap->br_startoff,
- bmap->br_blockcount,
- bmap->br_state);
-
ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
@@ -2738,8 +2657,7 @@ __xfs_rmap_add(
ri->ri_whichfork = whichfork;
ri->ri_bmap = *bmap;
- xfs_rmap_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
+ xfs_rmap_defer_add(tp, ri);
}
/* Map an extent into a file. */
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 9d01fe689497..b783dd4dd95d 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -157,6 +157,16 @@ enum xfs_rmap_intent_type {
XFS_RMAP_FREE,
};
+#define XFS_RMAP_INTENT_STRINGS \
+ { XFS_RMAP_MAP, "map" }, \
+ { XFS_RMAP_MAP_SHARED, "map_shared" }, \
+ { XFS_RMAP_UNMAP, "unmap" }, \
+ { XFS_RMAP_UNMAP_SHARED, "unmap_shared" }, \
+ { XFS_RMAP_CONVERT, "cvt" }, \
+ { XFS_RMAP_CONVERT_SHARED, "cvt_shared" }, \
+ { XFS_RMAP_ALLOC, "alloc" }, \
+ { XFS_RMAP_FREE, "free" }
+
struct xfs_rmap_intent {
struct list_head ri_list;
enum xfs_rmap_intent_type ri_type;
@@ -166,9 +176,6 @@ struct xfs_rmap_intent {
struct xfs_perag *ri_pag;
};
-void xfs_rmap_update_get_group(struct xfs_mount *mp,
- struct xfs_rmap_intent *ri);
-
/* functions for updating the rmapbt based on bmbt map/unmap operations */
void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
@@ -182,8 +189,6 @@ void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
- struct xfs_btree_cur *rcur, int error);
int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur);
int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 9e759efa81cc..56fd6c4bd8b4 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -88,6 +88,7 @@ xfs_rmapbt_alloc_block(
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_alloc_arg args = { .len = 1 };
int error;
xfs_agblock_t bno;
@@ -107,7 +108,11 @@ xfs_rmapbt_alloc_block(
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
+ /*
+ * Since rmapbt blocks are sourced from the AGFL, they are allocated one
+ * at a time and the reservation updates don't require a transaction.
+ */
+ xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
*stat = 1;
return 0;
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 34f104ed372c..2f7413afbf46 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -177,13 +177,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_REFC_BTREE_REF 1
#define XFS_SSB_REF 0
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
-#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
-#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
-
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 69fc5b981352..3c40f37e82c7 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -68,6 +68,8 @@ xfs_trans_ichgtime(
inode_set_mtime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CHG)
inode_set_ctime_to_ts(inode, tv);
+ if (flags & XFS_ICHGTIME_ACCESS)
+ inode_set_atime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6dbe6e7251e7..3dc8f785bf29 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,7 +22,6 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
-#include "xfs_da_format.h"
#define _ALLOC true
#define _FREE false
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 1ad8ec63a7f4..22f5f1a9d3f0 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -26,6 +26,7 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
+#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 4a0271123d94..2aa14b7ab630 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -160,7 +160,8 @@ xrep_newbt_add_blocks(
if (args->tp) {
ASSERT(xnr->oinfo.oi_offset == 0);
- error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+ error = xfs_alloc_schedule_autoreap(args,
+ XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
if (error)
goto out_pag;
}
@@ -414,7 +415,7 @@ xrep_newbt_free_extent(
*/
fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
- xnr->resv, true);
+ xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 90cd1512bba9..cd51f10f2920 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -12,7 +12,6 @@
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
-#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index be283153c254..53697f3c5e1b 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -451,7 +451,7 @@ xreap_agextent_iter(
xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
- rs->resv, true);
+ rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -477,7 +477,7 @@ xreap_agextent_iter(
* system with large EFIs.
*/
error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
- rs->resv, true);
+ rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -943,7 +943,8 @@ xrep_reap_bmapi_iter(
xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
return xfs_free_extent_later(sc->tp, imap->br_startblock,
- imap->br_blockcount, NULL, XFS_AG_RESV_NONE, true);
+ imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_SKIP_DISCARD);
}
/*
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index b747b625c5ee..d390d56cd875 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -40,11 +40,16 @@ xrep_tempfile_create(
struct xfs_scrub *sc,
uint16_t mode)
{
+ struct xfs_icreate_args args = {
+ .pip = sc->mp->m_rootip,
+ .mode = mode,
+ .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE,
+ };
struct xfs_mount *mp = sc->mp;
struct xfs_trans *tp = NULL;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
struct xfs_trans_res *tres;
struct xfs_inode *dp = mp->m_rootip;
xfs_ino_t ino;
@@ -65,8 +70,7 @@ xrep_tempfile_create(
* inode should be completely root owned so that we don't fail due to
* quota limits.
*/
- error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
- XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
+ error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -87,14 +91,11 @@ xrep_tempfile_create(
error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
if (error)
goto out_trans_cancel;
- error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
- 0, false, &sc->tempip);
+ error = xfs_icreate(tp, ino, &args, &sc->tempip);
if (error)
goto out_trans_cancel;
- /* Change the ownership of the inode to root. */
- VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
- VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
+ /* We don't touch file data, so drop the realtime flags. */
sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index f6ffb4f248f7..9355ccad9503 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -10,6 +10,10 @@
#define DEBUG 1
#endif
+#ifdef CONFIG_XFS_DEBUG_EXPENSIVE
+#define DEBUG_EXPENSIVE 1
+#endif
+
#ifdef CONFIG_XFS_ASSERT_FATAL
#define XFS_ASSERT_FATAL 1
#endif
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index a19d62e78aa1..e224b49b7cff 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -324,13 +324,9 @@ xfs_bmap_update_get_group(
struct xfs_mount *mp,
struct xfs_bmap_intent *bi)
{
- xfs_agnumber_t agno;
-
if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
return;
- agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
-
/*
* Bump the intent count on behalf of the deferred rmap and refcount
* intent items that that we can queue when we finish this bmap work.
@@ -338,7 +334,7 @@ xfs_bmap_update_get_group(
* intent drops the intent count, ensuring that the intent count
* remains nonzero across the transaction roll.
*/
- bi->bi_pag = xfs_perag_intent_get(mp, agno);
+ bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock);
}
/* Add this deferred BUI to the transaction. */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index ac2e77ebb54c..fe2e2c930975 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -486,13 +486,11 @@ out_unlock:
/*
* Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
*/
bool
xfs_can_free_eofblocks(
- struct xfs_inode *ip,
- bool force)
+ struct xfs_inode *ip)
{
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
@@ -526,11 +524,11 @@ xfs_can_free_eofblocks(
return false;
/*
- * Do not free real preallocated or append-only files unless the file
- * has delalloc blocks and we are forced to remove them.
+ * Only free real extents for inodes with persistent preallocations or
+ * the append-only flag.
*/
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
- if (!force || ip->i_delayed_blks == 0)
+ if (ip->i_delayed_blks == 0)
return false;
/*
@@ -584,6 +582,22 @@ xfs_free_eofblocks(
/* Wait on dio to ensure i_size has settled. */
inode_dio_wait(VFS_I(ip));
+ /*
+ * For preallocated files only free delayed allocations.
+ *
+ * Note that this means we also leave speculative preallocations in
+ * place for preallocated files.
+ */
+ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+ if (ip->i_delayed_blks) {
+ xfs_bmap_punch_delalloc_range(ip,
+ round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+ LLONG_MAX);
+ }
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error) {
ASSERT(xfs_is_shutdown(mp));
@@ -794,14 +808,18 @@ xfs_flush_unmap_range(
xfs_off_t offset,
xfs_off_t len)
{
- struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
xfs_off_t rounding, start, end;
int error;
- rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE);
- start = round_down(offset, rounding);
- end = round_up(offset + len, rounding) - 1;
+ /*
+ * Make sure we extend the flush out to extent alignment
+ * boundaries so any extent range overlapping the start/end
+ * of the modification we are about to do is clean and idle.
+ */
+ rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE);
+ start = rounddown_64(offset, rounding);
+ end = roundup_64(offset + len, rounding) - 1;
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
@@ -884,14 +902,14 @@ xfs_prepare_shift(
struct xfs_inode *ip,
loff_t offset)
{
- struct xfs_mount *mp = ip->i_mount;
+ unsigned int rounding;
int error;
/*
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
* into the accessible region of the file.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
+ if (xfs_can_free_eofblocks(ip)) {
error = xfs_free_eofblocks(ip);
if (error)
return error;
@@ -902,11 +920,13 @@ xfs_prepare_shift(
* with the full range of the operation. If we don't, a COW writeback
* completion could race with an insert, front merge with the start
* extent (after split) during the shift and corrupt the file. Start
- * with the block just prior to the start to stabilize the boundary.
+ * with the allocation unit just prior to the start to stabilize the
+ * boundary.
*/
- offset = round_down(offset, mp->m_sb.sb_blocksize);
+ rounding = xfs_inode_alloc_unitsize(ip);
+ offset = rounddown_64(offset, rounding);
if (offset)
- offset -= mp->m_sb.sb_blocksize;
+ offset -= rounding;
/*
* Writeback and invalidate cache for the remainder of the file as we're
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 51f84d8ff372..eb0895bfb9da 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
/* EOF block manipulation functions */
-bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool xfs_can_free_eofblocks(struct xfs_inode *ip);
int xfs_free_eofblocks(struct xfs_inode *ip);
int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 43031842341a..47549cfa61cd 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_error.h"
struct kmem_cache *xfs_buf_item_cache;
@@ -781,8 +782,39 @@ xfs_buf_item_committed(
return lsn;
}
+#ifdef DEBUG_EXPENSIVE
+static int
+xfs_buf_item_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ if (!bp->b_ops || !bp->b_ops->verify_struct)
+ return 0;
+ if (bip->bli_flags & XFS_BLI_STALE)
+ return 0;
+
+ fa = bp->b_ops->verify_struct(bp);
+ if (fa) {
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED, bp->b_ops->name,
+ bp->b_addr, BBTOB(bp->b_length), fa);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+
+ return 0;
+}
+#else
+# define xfs_buf_item_precommit NULL
+#endif
+
static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
+ .iop_precommit = xfs_buf_item_precommit,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 25fe3b932b5a..6f0fc7fe1f2b 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -20,6 +20,7 @@
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_rtbitmap.h"
/*
* Notes on an efficient, low latency fstrim algorithm
@@ -322,7 +323,7 @@ xfs_trim_should_stop(void)
* we found in the last batch as the key to start the next.
*/
static int
-xfs_trim_extents(
+xfs_trim_perag_extents(
struct xfs_perag *pag,
xfs_agblock_t start,
xfs_agblock_t end,
@@ -383,6 +384,259 @@ xfs_trim_extents(
}
+static int
+xfs_trim_datadev_extents(
+ struct xfs_mount *mp,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_extlen_t minlen,
+ uint64_t *blocks_trimmed)
+{
+ xfs_agnumber_t start_agno, end_agno;
+ xfs_agblock_t start_agbno, end_agbno;
+ xfs_daddr_t ddev_end;
+ struct xfs_perag *pag;
+ int last_error = 0, error;
+
+ ddev_end = min_t(xfs_daddr_t, end,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
+
+ start_agno = xfs_daddr_to_agno(mp, start);
+ start_agbno = xfs_daddr_to_agbno(mp, start);
+ end_agno = xfs_daddr_to_agno(mp, ddev_end);
+ end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
+
+ for_each_perag_range(mp, start_agno, end_agno, pag) {
+ xfs_agblock_t agend = pag->block_count;
+
+ if (start_agno == end_agno)
+ agend = end_agbno;
+ error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen,
+ blocks_trimmed);
+ if (error)
+ last_error = error;
+
+ if (xfs_trim_should_stop()) {
+ xfs_perag_rele(pag);
+ break;
+ }
+ start_agbno = 0;
+ }
+
+ return last_error;
+}
+
+#ifdef CONFIG_XFS_RT
+struct xfs_trim_rtdev {
+ /* list of rt extents to free */
+ struct list_head extent_list;
+
+ /* pointer to count of blocks trimmed */
+ uint64_t *blocks_trimmed;
+
+ /* minimum length that caller allows us to trim */
+ xfs_rtblock_t minlen_fsb;
+
+ /* restart point for the rtbitmap walk */
+ xfs_rtxnum_t restart_rtx;
+
+ /* stopping point for the current rtbitmap walk */
+ xfs_rtxnum_t stop_rtx;
+};
+
+struct xfs_rtx_busy {
+ struct list_head list;
+ xfs_rtblock_t bno;
+ xfs_rtblock_t length;
+};
+
+static void
+xfs_discard_free_rtdev_extents(
+ struct xfs_trim_rtdev *tr)
+{
+ struct xfs_rtx_busy *busyp, *n;
+
+ list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
+ list_del_init(&busyp->list);
+ kfree(busyp);
+ }
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+static int
+xfs_discard_rtdev_extents(
+ struct xfs_mount *mp,
+ struct xfs_trim_rtdev *tr)
+{
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_rtx_busy *busyp;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ xfs_rtblock_t start = NULLRTBLOCK, length = 0;
+ int error = 0;
+
+ blk_start_plug(&plug);
+ list_for_each_entry(busyp, &tr->extent_list, list) {
+ if (start == NULLRTBLOCK)
+ start = busyp->bno;
+ length += busyp->length;
+
+ trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
+
+ error = __blkdev_issue_discard(bdev,
+ XFS_FSB_TO_BB(mp, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, &bio);
+ if (error)
+ break;
+ }
+ xfs_discard_free_rtdev_extents(tr);
+
+ if (bio) {
+ error = submit_bio_wait(bio);
+ if (error == -EOPNOTSUPP)
+ error = 0;
+ if (error)
+ xfs_info(mp,
+ "discard failed for rtextent [0x%llx,%llu], error %d",
+ (unsigned long long)start,
+ (unsigned long long)length,
+ error);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
+
+ return error;
+}
+
+static int
+xfs_trim_gather_rtextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_trim_rtdev *tr = priv;
+ struct xfs_rtx_busy *busyp;
+ xfs_rtblock_t rbno, rlen;
+
+ if (rec->ar_startext > tr->stop_rtx) {
+ /*
+ * If we've scanned a large number of rtbitmap blocks, update
+ * the cursor to point at this extent so we restart the next
+ * batch from this extent.
+ */
+ tr->restart_rtx = rec->ar_startext;
+ return -ECANCELED;
+ }
+
+ rbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
+ rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+
+ /* Ignore too small. */
+ if (rlen < tr->minlen_fsb) {
+ trace_xfs_discard_rttoosmall(mp, rbno, rlen);
+ return 0;
+ }
+
+ busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
+ if (!busyp)
+ return -ENOMEM;
+
+ busyp->bno = rbno;
+ busyp->length = rlen;
+ INIT_LIST_HEAD(&busyp->list);
+ list_add_tail(&busyp->list, &tr->extent_list);
+ *tr->blocks_trimmed += rlen;
+
+ tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
+ return 0;
+}
+
+static int
+xfs_trim_rtdev_extents(
+ struct xfs_mount *mp,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
+ uint64_t *blocks_trimmed)
+{
+ struct xfs_rtalloc_rec low = { };
+ struct xfs_rtalloc_rec high = { };
+ struct xfs_trim_rtdev tr = {
+ .blocks_trimmed = blocks_trimmed,
+ .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
+ };
+ struct xfs_trans *tp;
+ xfs_daddr_t rtdev_daddr;
+ int error;
+
+ INIT_LIST_HEAD(&tr.extent_list);
+
+ /* Shift the start and end downwards to match the rt device. */
+ rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (start > rtdev_daddr)
+ start -= rtdev_daddr;
+ else
+ start = 0;
+
+ if (end <= rtdev_daddr)
+ return 0;
+ end -= rtdev_daddr;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ end = min_t(xfs_daddr_t, end,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1);
+
+ /* Convert the rt blocks to rt extents */
+ low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
+ high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
+
+ /*
+ * Walk the free ranges between low and high. The query_range function
+ * trims the extents returned.
+ */
+ do {
+ tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY);
+ xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
+ error = xfs_rtalloc_query_range(mp, tp, &low, &high,
+ xfs_trim_gather_rtextent, &tr);
+
+ if (error == -ECANCELED)
+ error = 0;
+ if (error) {
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ xfs_discard_free_rtdev_extents(&tr);
+ break;
+ }
+
+ if (list_empty(&tr.extent_list)) {
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ break;
+ }
+
+ error = xfs_discard_rtdev_extents(mp, &tr);
+ xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ if (error)
+ break;
+
+ low.ar_startext = tr.restart_rtx;
+ } while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext);
+
+ xfs_trans_cancel(tp);
+ return error;
+}
+#else
+# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
/*
* trim a range of the filesystem.
*
@@ -391,28 +645,37 @@ xfs_trim_extents(
* addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
* is a linear address range. Hence we need to use DADDR based conversions and
* comparisons for determining the correct offset and regions to trim.
+ *
+ * The realtime device is mapped into the FITRIM "address space" immediately
+ * after the data device.
*/
int
xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct xfs_perag *pag;
unsigned int granularity =
bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
+ struct block_device *rt_bdev = NULL;
struct fstrim_range range;
xfs_daddr_t start, end;
xfs_extlen_t minlen;
- xfs_agnumber_t start_agno, end_agno;
- xfs_agblock_t start_agbno, end_agbno;
+ xfs_rfsblock_t max_blocks;
uint64_t blocks_trimmed = 0;
int error, last_error = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
+ if (mp->m_rtdev_targp &&
+ bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
+ rt_bdev = mp->m_rtdev_targp->bt_bdev;
+ if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
return -EOPNOTSUPP;
+ if (rt_bdev)
+ granularity = max(granularity,
+ bdev_discard_granularity(rt_bdev));
+
/*
* We haven't recovered the log, so we cannot use our bnobt-guided
* storage zapping commands.
@@ -433,35 +696,27 @@ xfs_ioc_trim(
* used by the fstrim application. In the end it really doesn't
* matter as trimming blocks is an advisory interface.
*/
- if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+ max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
+ if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
range.len < mp->m_sb.sb_blocksize)
return -EINVAL;
start = BTOBB(range.start);
- end = min_t(xfs_daddr_t, start + BTOBBT(range.len),
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1;
+ end = start + BTOBBT(range.len) - 1;
- start_agno = xfs_daddr_to_agno(mp, start);
- start_agbno = xfs_daddr_to_agbno(mp, start);
- end_agno = xfs_daddr_to_agno(mp, end);
- end_agbno = xfs_daddr_to_agbno(mp, end);
-
- for_each_perag_range(mp, start_agno, end_agno, pag) {
- xfs_agblock_t agend = pag->block_count;
-
- if (start_agno == end_agno)
- agend = end_agbno;
- error = xfs_trim_extents(pag, start_agbno, agend, minlen,
+ if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
+ error = xfs_trim_datadev_extents(mp, start, end, minlen,
&blocks_trimmed);
if (error)
last_error = error;
+ }
- if (xfs_trim_should_stop()) {
- xfs_perag_rele(pag);
- break;
- }
- start_agbno = 0;
+ if (rt_bdev && !xfs_trim_should_stop()) {
+ error = xfs_trim_rtdev_extents(mp, start, end, minlen,
+ &blocks_trimmed);
+ if (error)
+ last_error = error;
}
if (last_error)
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 6a1aae799cf1..7d19091215b0 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
#include "xfs_log.h"
+#include "xfs_error.h"
static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
{
@@ -193,8 +194,38 @@ xfs_qm_dquot_logitem_committing(
return xfs_qm_dquot_logitem_release(lip);
}
+#ifdef DEBUG_EXPENSIVE
+static int
+xfs_qm_dquot_logitem_precommit(
+ struct xfs_trans *tp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_disk_dquot ddq = { };
+ xfs_failaddr_t fa;
+
+ xfs_dquot_to_disk(&ddq, dqp);
+ fa = xfs_dquot_verify(mp, &ddq, dqp->q_id);
+ if (fa) {
+ XFS_CORRUPTION_ERROR("Bad dquot during logging",
+ XFS_ERRLEVEL_LOW, mp, &ddq, sizeof(ddq));
+ xfs_alert(mp,
+ "Metadata corruption detected at %pS, dquot 0x%x",
+ fa, dqp->q_id);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+
+ return 0;
+}
+#else
+# define xfs_qm_dquot_logitem_precommit NULL
+#endif
+
static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
+ .iop_precommit = xfs_qm_dquot_logitem_precommit,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
index 005a66be44a2..7bdb9688c0f5 100644
--- a/fs/xfs/xfs_drain.c
+++ b/fs/xfs/xfs_drain.c
@@ -94,17 +94,17 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
}
/*
- * Get a passive reference to an AG and declare an intent to update its
- * metadata.
+ * Get a passive reference to the AG that contains a fsbno and declare an intent
+ * to update its metadata.
*/
struct xfs_perag *
xfs_perag_intent_get(
struct xfs_mount *mp,
- xfs_agnumber_t agno)
+ xfs_fsblock_t fsbno)
{
struct xfs_perag *pag;
- pag = xfs_perag_get(mp, agno);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno));
if (!pag)
return NULL;
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
index 50a5772a8296..775164f54ea6 100644
--- a/fs/xfs/xfs_drain.h
+++ b/fs/xfs/xfs_drain.h
@@ -62,7 +62,7 @@ void xfs_drain_wait_enable(void);
* until the item is finished or cancelled.
*/
struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
- xfs_agnumber_t agno);
+ xfs_fsblock_t fsbno);
void xfs_perag_intent_put(struct xfs_perag *pag);
void xfs_perag_intent_hold(struct xfs_perag *pag);
@@ -76,7 +76,8 @@ struct xfs_defer_drain { /* empty */ };
#define xfs_defer_drain_free(dr) ((void)0)
#define xfs_defer_drain_init(dr) ((void)0)
-#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno))
+#define xfs_perag_intent_get(mp, fsbno) \
+ xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno))
#define xfs_perag_intent_put(pag) xfs_perag_put(pag)
static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { }
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8c382f092332..abffc74a924f 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -303,6 +303,11 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
.iop_intent = xfs_efd_item_intent,
};
+static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_extent_free_item, xefi_list);
+}
+
/*
* Fill the EFD with all extents from the EFI when we need to roll the
* transaction and continue with a new EFI.
@@ -331,6 +336,22 @@ xfs_efd_from_efi(
efdp->efd_next_extent = efip->efi_format.efi_nextents;
}
+static void
+xfs_efd_add_extent(
+ struct xfs_efd_log_item *efdp,
+ struct xfs_extent_free_item *xefi)
+{
+ struct xfs_extent *extp;
+
+ ASSERT(efdp->efd_next_extent < efdp->efd_format.efd_nextents);
+
+ extp = &efdp->efd_format.efd_extents[efdp->efd_next_extent];
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
+
+ efdp->efd_next_extent++;
+}
+
/* Sort bmap items by AG. */
static int
xfs_extent_free_diff_items(
@@ -338,11 +359,8 @@ xfs_extent_free_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_extent_free_item *ra;
- struct xfs_extent_free_item *rb;
-
- ra = container_of(a, struct xfs_extent_free_item, xefi_list);
- rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+ struct xfs_extent_free_item *ra = xefi_entry(a);
+ struct xfs_extent_free_item *rb = xefi_entry(b);
return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
}
@@ -418,24 +436,35 @@ xfs_extent_free_create_done(
return &efdp->efd_item;
}
-/* Take a passive ref to the AG containing the space we're freeing. */
+/* Add this deferred EFI to the transaction. */
void
-xfs_extent_free_get_group(
- struct xfs_mount *mp,
- struct xfs_extent_free_item *xefi)
+xfs_extent_free_defer_add(
+ struct xfs_trans *tp,
+ struct xfs_extent_free_item *xefi,
+ struct xfs_defer_pending **dfpp)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = tp->t_mountp;
+
+ trace_xfs_extent_free_defer(mp, xefi);
- agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
- xefi->xefi_pag = xfs_perag_intent_get(mp, agno);
+ xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
+ if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+ &xfs_agfl_free_defer_type);
+ else
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list,
+ &xfs_extent_free_defer_type);
}
-/* Release a passive AG ref after some freeing work. */
-static inline void
-xfs_extent_free_put_group(
- struct xfs_extent_free_item *xefi)
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+ struct list_head *item)
{
+ struct xfs_extent_free_item *xefi = xefi_entry(item);
+
xfs_perag_intent_put(xefi->xefi_pag);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
}
/* Process a free extent. */
@@ -447,15 +476,12 @@ xfs_extent_free_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_owner_info oinfo = { };
- struct xfs_extent_free_item *xefi;
+ struct xfs_extent_free_item *xefi = xefi_entry(item);
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent *extp;
- uint next_extent;
xfs_agblock_t agbno;
int error = 0;
- xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
oinfo.oi_owner = xefi->xefi_owner;
@@ -464,8 +490,7 @@ xfs_extent_free_finish_item(
if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
- agbno, xefi->xefi_blockcount);
+ trace_xfs_extent_free_deferred(mp, xefi);
/*
* If we need a new transaction to make progress, the caller will log a
@@ -482,16 +507,8 @@ xfs_extent_free_finish_item(
return error;
}
- /* Add the work we finished to the EFD, even though nobody uses that */
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = xefi->xefi_startblock;
- extp->ext_len = xefi->xefi_blockcount;
- efdp->efd_next_extent++;
-
- xfs_extent_free_put_group(xefi);
- kmem_cache_free(xfs_extfree_item_cache, xefi);
+ xfs_efd_add_extent(efdp, xefi);
+ xfs_extent_free_cancel_item(item);
return error;
}
@@ -503,19 +520,6 @@ xfs_extent_free_abort_intent(
xfs_efi_release(EFI_ITEM(intent));
}
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
- struct list_head *item)
-{
- struct xfs_extent_free_item *xefi;
-
- xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
-
- xfs_extent_free_put_group(xefi);
- kmem_cache_free(xfs_extfree_item_cache, xefi);
-}
-
/*
* AGFL blocks are accounted differently in the reserve pools and are not
* inserted into the busy extent list.
@@ -530,35 +534,24 @@ xfs_agfl_free_finish_item(
struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
- struct xfs_extent_free_item *xefi;
- struct xfs_extent *extp;
+ struct xfs_extent_free_item *xefi = xefi_entry(item);
struct xfs_buf *agbp;
int error;
xfs_agblock_t agbno;
- uint next_extent;
- xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
ASSERT(xefi->xefi_blockcount == 1);
agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
oinfo.oi_owner = xefi->xefi_owner;
- trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno,
- xefi->xefi_blockcount);
+ trace_xfs_agfl_free_deferred(mp, xefi);
error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
if (!error)
- error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
- agbno, agbp, &oinfo);
+ error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno,
+ agbno, 1, &oinfo, XFS_AG_RESV_AGFL);
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = xefi->xefi_startblock;
- extp->ext_len = xefi->xefi_blockcount;
- efdp->efd_next_extent++;
-
- xfs_extent_free_put_group(xefi);
- kmem_cache_free(xfs_extfree_item_cache, xefi);
+ xfs_efd_add_extent(efdp, xefi);
+ xfs_extent_free_cancel_item(&xefi->xefi_list);
return error;
}
@@ -585,7 +578,7 @@ xfs_efi_recover_work(
xefi->xefi_blockcount = extp->ext_len;
xefi->xefi_agresv = XFS_AG_RESV_NONE;
xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
- xfs_extent_free_get_group(mp, xefi);
+ xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
xfs_defer_add_item(dfp, &xefi->xefi_list);
}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index da6a5afa607c..41b7c4306079 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -88,4 +88,10 @@ xfs_efd_log_item_sizeof(
extern struct kmem_cache *xfs_efi_cache;
extern struct kmem_cache *xfs_efd_cache;
+struct xfs_extent_free_item;
+
+void xfs_extent_free_defer_add(struct xfs_trans *tp,
+ struct xfs_extent_free_item *xefi,
+ struct xfs_defer_pending **dfpp);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b240ea5241dc..4cdc54dc9686 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -213,29 +213,18 @@ xfs_ilock_iocb_for_write(
if (ret)
return ret;
- if (*lock_mode == XFS_IOLOCK_EXCL)
- return 0;
- if (!xfs_iflags_test(ip, XFS_IREMAPPING))
- return 0;
-
- xfs_iunlock(ip, *lock_mode);
- *lock_mode = XFS_IOLOCK_EXCL;
- return xfs_ilock_iocb(iocb, *lock_mode);
-}
-
-static unsigned int
-xfs_ilock_for_write_fault(
- struct xfs_inode *ip)
-{
- /* get a shared lock if no remapping in progress */
- xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
- if (!xfs_iflags_test(ip, XFS_IREMAPPING))
- return XFS_MMAPLOCK_SHARED;
+ /*
+ * If a reflink remap is in progress we always need to take the iolock
+ * exclusively to wait for it to finish.
+ */
+ if (*lock_mode == XFS_IOLOCK_SHARED &&
+ xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, *lock_mode);
+ *lock_mode = XFS_IOLOCK_EXCL;
+ return xfs_ilock_iocb(iocb, *lock_mode);
+ }
- /* wait for remapping to complete */
- xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- return XFS_MMAPLOCK_EXCL;
+ return 0;
}
STATIC ssize_t
@@ -1247,31 +1236,77 @@ xfs_file_llseek(
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
-#ifdef CONFIG_FS_DAX
static inline vm_fault_t
-xfs_dax_fault(
+xfs_dax_fault_locked(
struct vm_fault *vmf,
unsigned int order,
- bool write_fault,
- pfn_t *pfn)
+ bool write_fault)
{
- return dax_iomap_fault(vmf, order, pfn, NULL,
+ vm_fault_t ret;
+ pfn_t pfn;
+
+ if (!IS_ENABLED(CONFIG_FS_DAX)) {
+ ASSERT(0);
+ return VM_FAULT_SIGBUS;
+ }
+ ret = dax_iomap_fault(vmf, order, &pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
+ if (ret & VM_FAULT_NEEDDSYNC)
+ ret = dax_finish_sync_fault(vmf, order, pfn);
+ return ret;
}
-#else
-static inline vm_fault_t
-xfs_dax_fault(
+
+static vm_fault_t
+xfs_dax_read_fault(
struct vm_fault *vmf,
- unsigned int order,
- bool write_fault,
- pfn_t *pfn)
+ unsigned int order)
{
- ASSERT(0);
- return VM_FAULT_SIGBUS;
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ vm_fault_t ret;
+
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ ret = xfs_dax_fault_locked(vmf, order, false);
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+ return ret;
+}
+
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
+ vm_fault_t ret;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+
+ /*
+ * Normally we only need the shared mmaplock, but if a reflink remap is
+ * in progress we take the exclusive lock to wait for the remap to
+ * finish before taking a write fault.
+ */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ if (xfs_iflags_test(ip, XFS_IREMAPPING)) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ lock_mode = XFS_MMAPLOCK_EXCL;
+ }
+
+ if (IS_DAX(inode))
+ ret = xfs_dax_fault_locked(vmf, order, true);
+ else
+ ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
+ xfs_iunlock(ip, lock_mode);
+
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
-#endif
/*
* Locking for serialisation of IO during page faults. This results in a lock
@@ -1290,38 +1325,14 @@ __xfs_filemap_fault(
bool write_fault)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- vm_fault_t ret;
- unsigned int lock_mode = 0;
-
- trace_xfs_filemap_fault(ip, order, write_fault);
-
- if (write_fault) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- }
-
- if (IS_DAX(inode) || write_fault)
- lock_mode = xfs_ilock_for_write_fault(XFS_I(inode));
- if (IS_DAX(inode)) {
- pfn_t pfn;
-
- ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
- if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, order, pfn);
- } else if (write_fault) {
- ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
- } else {
- ret = filemap_fault(vmf);
- }
-
- if (lock_mode)
- xfs_iunlock(XFS_I(inode), lock_mode);
+ trace_xfs_filemap_fault(XFS_I(inode), order, write_fault);
if (write_fault)
- sb_end_pagefault(inode->i_sb);
- return ret;
+ return xfs_write_fault(vmf, order);
+ if (IS_DAX(inode))
+ return xfs_dax_read_fault(vmf, order);
+ return filemap_fault(vmf);
}
static inline bool
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index a3f16e9b6fe5..cf5acbd3c7ca 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -21,7 +21,6 @@
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_parent.h"
-#include "xfs_da_btree.h"
#include "xfs_handle.h"
#include "xfs_health.h"
#include "xfs_icache.h"
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0953163a2d84..cf629302d48e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -86,9 +86,8 @@ xfs_inode_alloc(
return NULL;
}
- /* VFS doesn't initialise i_mode or i_state! */
+ /* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
- VFS_I(ip)->i_state = 0;
mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
@@ -314,6 +313,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
+ unsigned long state = inode->i_state;
error = inode_init_always(mp->m_super, inode);
@@ -324,6 +324,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
+ inode->i_state = state;
mapping_set_large_folios(inode->i_mapping);
return error;
}
@@ -1155,7 +1156,7 @@ xfs_inode_free_eofblocks(
}
*lockflags |= XFS_IOLOCK_EXCL;
- if (xfs_can_free_eofblocks(ip, false))
+ if (xfs_can_free_eofblocks(ip))
return xfs_free_eofblocks(ip);
/* inode could be preallocated or append-only */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f36091e1e7f5..7dc6f326936c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,54 +42,11 @@
#include "xfs_pnfs.h"
#include "xfs_parent.h"
#include "xfs_xattr.h"
+#include "xfs_inode_util.h"
struct kmem_cache *xfs_inode_cache;
/*
- * helper function to extract extent size hint from inode
- */
-xfs_extlen_t
-xfs_get_extsz_hint(
- struct xfs_inode *ip)
-{
- /*
- * No point in aligning allocations if we need to COW to actually
- * write to them.
- */
- if (xfs_is_always_cow_inode(ip))
- return 0;
- if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
- return ip->i_extsize;
- if (XFS_IS_REALTIME_INODE(ip) &&
- ip->i_mount->m_sb.sb_rextsize > 1)
- return ip->i_mount->m_sb.sb_rextsize;
- return 0;
-}
-
-/*
- * Helper function to extract CoW extent size hint from inode.
- * Between the extent size hint and the CoW extent size hint, we
- * return the greater of the two. If the value is zero (automatic),
- * use the default size.
- */
-xfs_extlen_t
-xfs_get_cowextsz_hint(
- struct xfs_inode *ip)
-{
- xfs_extlen_t a, b;
-
- a = 0;
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- a = ip->i_cowextsize;
- b = xfs_get_extsz_hint(ip);
-
- a = max(a, b);
- if (a == 0)
- return XFS_DEFAULT_COWEXTSZ_HINT;
- return a;
-}
-
-/*
* These two are wrapper routines around the xfs_ilock() routine used to
* centralize some grungy code. They are used in places that wish to lock the
* inode solely for reading the extents. The reason these places can't just
@@ -566,55 +523,6 @@ xfs_lock_two_inodes(
}
}
-uint
-xfs_ip2xflags(
- struct xfs_inode *ip)
-{
- uint flags = 0;
-
- if (ip->i_diflags & XFS_DIFLAG_ANY) {
- if (ip->i_diflags & XFS_DIFLAG_REALTIME)
- flags |= FS_XFLAG_REALTIME;
- if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
- flags |= FS_XFLAG_PREALLOC;
- if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
- flags |= FS_XFLAG_IMMUTABLE;
- if (ip->i_diflags & XFS_DIFLAG_APPEND)
- flags |= FS_XFLAG_APPEND;
- if (ip->i_diflags & XFS_DIFLAG_SYNC)
- flags |= FS_XFLAG_SYNC;
- if (ip->i_diflags & XFS_DIFLAG_NOATIME)
- flags |= FS_XFLAG_NOATIME;
- if (ip->i_diflags & XFS_DIFLAG_NODUMP)
- flags |= FS_XFLAG_NODUMP;
- if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
- flags |= FS_XFLAG_RTINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
- flags |= FS_XFLAG_PROJINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
- flags |= FS_XFLAG_NOSYMLINKS;
- if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
- flags |= FS_XFLAG_EXTSIZE;
- if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
- flags |= FS_XFLAG_EXTSZINHERIT;
- if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
- flags |= FS_XFLAG_NODEFRAG;
- if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
- flags |= FS_XFLAG_FILESTREAM;
- }
-
- if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
- if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
- flags |= FS_XFLAG_DAX;
- if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
- flags |= FS_XFLAG_COWEXTSIZE;
- }
-
- if (xfs_inode_has_attr_fork(ip))
- flags |= FS_XFLAG_HASATTR;
- return flags;
-}
-
/*
* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
* is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -656,97 +564,6 @@ out_unlock:
return error;
}
-/* Propagate di_flags from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags(
- struct xfs_inode *ip,
- const struct xfs_inode *pip)
-{
- unsigned int di_flags = 0;
- xfs_failaddr_t failaddr;
- umode_t mode = VFS_I(ip)->i_mode;
-
- if (S_ISDIR(mode)) {
- if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_extsize = pip->i_extsize;
- }
- if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- xfs_has_realtime(ip->i_mount))
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_extsize = pip->i_extsize;
- }
- }
- if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_diflags |= di_flags;
-
- /*
- * Inode verifiers on older kernels only check that the extent size
- * hint is an integer multiple of the rt extent size on realtime files.
- * They did not check the hint alignment on a directory with both
- * rtinherit and extszinherit flags set. If the misaligned hint is
- * propagated from a directory into a new realtime file, new file
- * allocations will fail due to math errors in the rt allocator and/or
- * trip the verifiers. Validate the hint settings in the new file so
- * that we don't let broken hints propagate.
- */
- failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
- VFS_I(ip)->i_mode, ip->i_diflags);
- if (failaddr) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- }
-}
-
-/* Propagate di_flags2 from a parent inode to a child inode. */
-static void
-xfs_inode_inherit_flags2(
- struct xfs_inode *ip,
- const struct xfs_inode *pip)
-{
- xfs_failaddr_t failaddr;
-
- if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
- ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = pip->i_cowextsize;
- }
- if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
- ip->i_diflags2 |= XFS_DIFLAG2_DAX;
-
- /* Don't let invalid cowextsize hints propagate. */
- failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
- VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
- if (failaddr) {
- ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = 0;
- }
-}
-
/*
* Initialise a newly allocated inode and return the in-core inode to the
* caller locked exclusively.
@@ -754,39 +571,15 @@ xfs_inode_inherit_flags2(
* Caller is responsible for unlocking the inode manually upon return
*/
int
-xfs_init_new_inode(
- struct mnt_idmap *idmap,
+xfs_icreate(
struct xfs_trans *tp,
- struct xfs_inode *pip,
xfs_ino_t ino,
- umode_t mode,
- xfs_nlink_t nlink,
- dev_t rdev,
- prid_t prid,
- bool init_xattrs,
+ const struct xfs_icreate_args *args,
struct xfs_inode **ipp)
{
- struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_inode *ip;
- unsigned int flags;
+ struct xfs_inode *ip = NULL;
int error;
- struct timespec64 tv;
- struct inode *inode;
-
- /*
- * Protect against obviously corrupt allocation btree records. Later
- * xfs_iget checks will catch re-allocation of other active in-memory
- * and on-disk inodes. If we don't catch reallocating the parent inode
- * here we will deadlock in xfs_iget() so we have to do these checks
- * first.
- */
- if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
- xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
- xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
- XFS_SICK_AG_INOBT);
- return -EFSCORRUPTED;
- }
/*
* Get the in-core inode with the lock held exclusively to prevent
@@ -797,89 +590,8 @@ xfs_init_new_inode(
return error;
ASSERT(ip != NULL);
- inode = VFS_I(ip);
- set_nlink(inode, nlink);
- inode->i_rdev = rdev;
- ip->i_projid = prid;
-
- if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
- inode_fsuid_set(inode, idmap);
- inode->i_gid = dir->i_gid;
- inode->i_mode = mode;
- } else {
- inode_init_owner(idmap, inode, dir, mode);
- }
-
- /*
- * If the group ID of the new file does not match the effective group
- * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
- * (and only if the irix_sgid_inherit compatibility variable is set).
- */
- if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
- inode->i_mode &= ~S_ISGID;
-
- ip->i_disk_size = 0;
- ip->i_df.if_nextents = 0;
- ASSERT(ip->i_nblocks == 0);
-
- tv = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, tv);
- inode_set_atime_to_ts(inode, tv);
-
- ip->i_extsize = 0;
- ip->i_diflags = 0;
-
- if (xfs_has_v3inodes(mp)) {
- inode_set_iversion(inode, 1);
- ip->i_cowextsize = 0;
- ip->i_crtime = tv;
- }
-
- flags = XFS_ILOG_CORE;
- switch (mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- ip->i_df.if_format = XFS_DINODE_FMT_DEV;
- flags |= XFS_ILOG_DEV;
- break;
- case S_IFREG:
- case S_IFDIR:
- if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
- xfs_inode_inherit_flags(ip, pip);
- if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
- xfs_inode_inherit_flags2(ip, pip);
- fallthrough;
- case S_IFLNK:
- ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_df.if_bytes = 0;
- ip->i_df.if_data = NULL;
- break;
- default:
- ASSERT(0);
- }
-
- /*
- * If we need to create attributes immediately after allocating the
- * inode, initialise an empty attribute fork right now. We use the
- * default fork offset for attributes here as we don't know exactly what
- * size or how many attributes we might be adding. We can do this
- * safely here because we know the data fork is completely empty and
- * this saves us from needing to run a separate transaction to set the
- * fork offset in the immediate future.
- */
- if (init_xattrs && xfs_has_attr(mp)) {
- ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
- xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
- }
-
- /*
- * Log the new values stuffed into the inode.
- */
xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, flags);
+ xfs_inode_init(tp, args, ip);
/* now that we have an i_mode we can setup the inode structure */
xfs_setup_inode(ip);
@@ -888,158 +600,60 @@ xfs_init_new_inode(
return 0;
}
-/*
- * Decrement the link count on an inode & log the change. If this causes the
- * link count to go to zero, move the inode to AGI unlinked list so that it can
- * be freed when the last active reference goes away via xfs_inactive().
- */
+/* Return dquots for the ids that will be assigned to a new file. */
int
-xfs_droplink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- if (inode->i_nlink == 0) {
- xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count dropped below zero. Pinning link count.",
- ip->i_ino);
- set_nlink(inode, XFS_NLINK_PINNED);
- }
- if (inode->i_nlink != XFS_NLINK_PINNED)
- drop_nlink(inode);
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- if (inode->i_nlink)
- return 0;
-
- return xfs_iunlink(tp, ip);
-}
-
-/*
- * Increment the link count on an inode & log the change.
- */
-void
-xfs_bumplink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-
- if (inode->i_nlink == XFS_NLINK_PINNED - 1)
- xfs_info_ratelimited(tp->t_mountp,
- "Inode 0x%llx link count exceeded maximum. Pinning link count.",
- ip->i_ino);
- if (inode->i_nlink != XFS_NLINK_PINNED)
- inc_nlink(inode);
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-}
-
-#ifdef CONFIG_XFS_LIVE_HOOKS
-/*
- * Use a static key here to reduce the overhead of directory live update hooks.
- * If the compiler supports jump labels, the static branch will be replaced by
- * a nop sled when there are no hook users. Online fsck is currently the only
- * caller, so this is a reasonable tradeoff.
- *
- * Note: Patching the kernel code requires taking the cpu hotplug lock. Other
- * parts of the kernel allocate memory with that lock held, which means that
- * XFS callers cannot hold any locks that might be used by memory reclaim or
- * writeback when calling the static_branch_{inc,dec} functions.
- */
-DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
-
-void
-xfs_dir_hook_disable(void)
-{
- xfs_hooks_switch_off(&xfs_dir_hooks_switch);
-}
-
-void
-xfs_dir_hook_enable(void)
-{
- xfs_hooks_switch_on(&xfs_dir_hooks_switch);
-}
-
-/* Call hooks for a directory update relating to a child dirent update. */
-inline void
-xfs_dir_update_hook(
- struct xfs_inode *dp,
- struct xfs_inode *ip,
- int delta,
- const struct xfs_name *name)
-{
- if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
- struct xfs_dir_update_params p = {
- .dp = dp,
- .ip = ip,
- .delta = delta,
- .name = name,
- };
- struct xfs_mount *mp = ip->i_mount;
-
- xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+xfs_icreate_dqalloc(
+ const struct xfs_icreate_args *args,
+ struct xfs_dquot **udqpp,
+ struct xfs_dquot **gdqpp,
+ struct xfs_dquot **pdqpp)
+{
+ struct inode *dir = VFS_I(args->pip);
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
+ prid_t prid = 0;
+ unsigned int flags = XFS_QMOPT_QUOTALL;
+
+ if (args->idmap) {
+ /*
+ * The uid/gid computation code must match what the VFS uses to
+ * assign i_[ug]id. INHERIT adjusts the gid computation for
+ * setgid/grpid systems.
+ */
+ uid = mapped_fsuid(args->idmap, i_user_ns(dir));
+ gid = mapped_fsgid(args->idmap, i_user_ns(dir));
+ prid = xfs_get_initial_prid(args->pip);
+ flags |= XFS_QMOPT_INHERIT;
}
-}
-/* Call the specified function during a directory update. */
-int
-xfs_dir_hook_add(
- struct xfs_mount *mp,
- struct xfs_dir_hook *hook)
-{
- return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
-}
+ *udqpp = *gdqpp = *pdqpp = NULL;
-/* Stop calling the specified function during a directory update. */
-void
-xfs_dir_hook_del(
- struct xfs_mount *mp,
- struct xfs_dir_hook *hook)
-{
- xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+ return xfs_qm_vop_dqalloc(args->pip, uid, gid, prid, flags, udqpp,
+ gdqpp, pdqpp);
}
-/* Configure directory update hook functions. */
-void
-xfs_dir_hook_setup(
- struct xfs_dir_hook *hook,
- notifier_fn_t mod_fn)
-{
- xfs_hook_setup(&hook->dirent_hook, mod_fn);
-}
-#endif /* CONFIG_XFS_LIVE_HOOKS */
-
int
xfs_create(
- struct mnt_idmap *idmap,
- struct xfs_inode *dp,
+ const struct xfs_icreate_args *args,
struct xfs_name *name,
- umode_t mode,
- dev_t rdev,
- bool init_xattrs,
- xfs_inode_t **ipp)
+ struct xfs_inode **ipp)
{
- int is_dir = S_ISDIR(mode);
+ struct xfs_inode *dp = args->pip;
+ struct xfs_dir_update du = {
+ .dp = dp,
+ .name = name,
+ };
struct xfs_mount *mp = dp->i_mount;
- struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
- int error;
- bool unlock_dp_on_error = false;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
struct xfs_trans_res *tres;
- uint resblks;
xfs_ino_t ino;
- struct xfs_parent_args *ppargs;
+ bool unlock_dp_on_error = false;
+ bool is_dir = S_ISDIR(args->mode);
+ uint resblks;
+ int error;
trace_xfs_create(dp, name);
@@ -1048,15 +662,8 @@ xfs_create(
if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
return -EIO;
- prid = xfs_get_initial_prid(dp);
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
- mapped_fsgid(idmap, &init_user_ns), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ /* Make sure that we have allocated dquot(s) on disk. */
+ error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -1068,7 +675,7 @@ xfs_create(
tres = &M_RES(mp)->tr_create;
}
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto out_release_dquots;
@@ -1097,10 +704,9 @@ xfs_create(
* entry pointing to them, but a directory also the "." entry
* pointing to itself.
*/
- error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
if (!error)
- error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
+ error = xfs_icreate(tp, ino, args, &du.ip);
if (error)
goto out_trans_cancel;
@@ -1113,38 +719,9 @@ xfs_create(
*/
xfs_trans_ijoin(tp, dp, 0);
- error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks - XFS_IALLOC_SPACE_RES(mp));
- if (error) {
- ASSERT(error != -ENOSPC);
+ error = xfs_dir_create_child(tp, resblks, &du);
+ if (error)
goto out_trans_cancel;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- if (is_dir) {
- error = xfs_dir_init(tp, ip, dp);
- if (error)
- goto out_trans_cancel;
-
- xfs_bumplink(tp, dp);
- }
-
- /*
- * If we have parent pointers, we need to add the attribute containing
- * the parent information now.
- */
- if (ppargs) {
- error = xfs_parent_addname(tp, ppargs, dp, name, ip);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * Create ip with a reference from dp, and add '.' and '..' references
- * if it's a directory.
- */
- xfs_dir_update_hook(dp, ip, 1, name);
/*
* If this is a synchronous mount, make sure that the
@@ -1159,7 +736,7 @@ xfs_create(
* These ids of the inode couldn't have changed since the new
* inode has been locked ever since it was created.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+ xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
error = xfs_trans_commit(tp);
if (error)
@@ -1169,10 +746,10 @@ xfs_create(
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
- *ipp = ip;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ *ipp = du.ip;
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return 0;
out_trans_cancel:
@@ -1183,13 +760,13 @@ xfs_create(
* setup of the inode and release the inode. This prevents recursive
* transactions and deadlocks from xfs_inactive.
*/
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_finish_inode_setup(ip);
- xfs_irele(ip);
+ if (du.ip) {
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(du.ip);
+ xfs_irele(du.ip);
}
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1202,36 +779,28 @@ xfs_create(
int
xfs_create_tmpfile(
- struct mnt_idmap *idmap,
- struct xfs_inode *dp,
- umode_t mode,
- bool init_xattrs,
+ const struct xfs_icreate_args *args,
struct xfs_inode **ipp)
{
+ struct xfs_inode *dp = args->pip;
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
- int error;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
struct xfs_trans_res *tres;
- uint resblks;
xfs_ino_t ino;
+ uint resblks;
+ int error;
+
+ ASSERT(args->flags & XFS_ICREATE_TMPFILE);
if (xfs_is_shutdown(mp))
return -EIO;
- prid = xfs_get_initial_prid(dp);
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
- mapped_fsgid(idmap, &init_user_ns), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ /* Make sure that we have allocated dquot(s) on disk. */
+ error = xfs_icreate_dqalloc(args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -1243,10 +812,9 @@ xfs_create_tmpfile(
if (error)
goto out_release_dquots;
- error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
+ error = xfs_dialloc(&tp, dp->i_ino, args->mode, &ino);
if (!error)
- error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
- 0, 0, prid, init_xattrs, &ip);
+ error = xfs_icreate(tp, ino, args, &ip);
if (error)
goto out_trans_cancel;
@@ -1303,11 +871,15 @@ xfs_link(
struct xfs_inode *sip,
struct xfs_name *target_name)
{
+ struct xfs_dir_update du = {
+ .dp = tdp,
+ .name = target_name,
+ .ip = sip,
+ };
struct xfs_mount *mp = tdp->i_mount;
struct xfs_trans *tp;
int error, nospace_error = 0;
int resblks;
- struct xfs_parent_args *ppargs;
trace_xfs_link(tdp, target_name);
@@ -1326,7 +898,7 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto std_return;
@@ -1341,7 +913,7 @@ xfs_link(
* pointers are enabled because we can't back out if the xattrs must
* grow.
*/
- if (ppargs && nospace_error) {
+ if (du.ppargs && nospace_error) {
error = nospace_error;
goto error_return;
}
@@ -1368,47 +940,9 @@ xfs_link(
}
}
- if (!resblks) {
- error = xfs_dir_canenter(tp, tdp, target_name);
- if (error)
- goto error_return;
- }
-
- /*
- * Handle initial link state of O_TMPFILE inode
- */
- if (VFS_I(sip)->i_nlink == 0) {
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
- error = xfs_iunlink_remove(tp, pag, sip);
- xfs_perag_put(pag);
- if (error)
- goto error_return;
- }
-
- error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
- resblks);
+ error = xfs_dir_add_child(tp, resblks, &du);
if (error)
goto error_return;
- xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
-
- xfs_bumplink(tp, sip);
-
- /*
- * If we have parent pointers, we now need to add the parent record to
- * the attribute fork of the inode. If this is the initial parent
- * attribute, we need to create it correctly, otherwise we can just add
- * the parent to the inode.
- */
- if (ppargs) {
- error = xfs_parent_addname(tp, ppargs, tdp, target_name, sip);
- if (error)
- goto error_return;
- }
-
- xfs_dir_update_hook(tdp, sip, 1, target_name);
/*
* If this is a synchronous mount, make sure that the
@@ -1421,7 +955,7 @@ xfs_link(
error = xfs_trans_commit(tp);
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return error;
error_return:
@@ -1429,7 +963,7 @@ xfs_link(
xfs_iunlock(tdp, XFS_ILOCK_EXCL);
xfs_iunlock(sip, XFS_ILOCK_EXCL);
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
std_return:
if (error == -ENOSPC && nospace_error)
error = nospace_error;
@@ -1595,7 +1129,7 @@ xfs_release(
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
return 0;
- if (xfs_can_free_eofblocks(ip, false)) {
+ if (xfs_can_free_eofblocks(ip)) {
/*
* Check if the inode is being opened, written and closed
* frequently and we have delayed allocation blocks outstanding
@@ -1856,15 +1390,13 @@ xfs_inode_needs_inactive(
/*
* This file isn't being freed, so check if there are post-eof blocks
- * to free. @force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with broken
- * free space accounting.
+ * to free.
*
* Note: don't bother with iolock here since lockdep complains about
* acquiring it in reclaim context. We have the only reference to the
* inode at this point anyways.
*/
- return xfs_can_free_eofblocks(ip, true);
+ return xfs_can_free_eofblocks(ip);
}
/*
@@ -1947,15 +1479,11 @@ xfs_inactive(
if (VFS_I(ip)->i_nlink != 0) {
/*
- * force is true because we are evicting an inode from the
- * cache. Post-eof blocks must be freed, lest we end up with
- * broken free space accounting.
- *
* Note: don't bother with iolock here since lockdep complains
* about acquiring it in reclaim context. We have the only
* reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true))
+ if (xfs_can_free_eofblocks(ip))
error = xfs_free_eofblocks(ip);
goto out;
@@ -2022,39 +1550,6 @@ out:
}
/*
- * In-Core Unlinked List Lookups
- * =============================
- *
- * Every inode is supposed to be reachable from some other piece of metadata
- * with the exception of the root directory. Inodes with a connection to a
- * file descriptor but not linked from anywhere in the on-disk directory tree
- * are collectively known as unlinked inodes, though the filesystem itself
- * maintains links to these inodes so that on-disk metadata are consistent.
- *
- * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
- * header contains a number of buckets that point to an inode, and each inode
- * record has a pointer to the next inode in the hash chain. This
- * singly-linked list causes scaling problems in the iunlink remove function
- * because we must walk that list to find the inode that points to the inode
- * being removed from the unlinked hash bucket list.
- *
- * Hence we keep an in-memory double linked list to link each inode on an
- * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
- * based lists would require having 64 list heads in the perag, one for each
- * list. This is expensive in terms of memory (think millions of AGs) and cache
- * misses on lookups. Instead, use the fact that inodes on the unlinked list
- * must be referenced at the VFS level to keep them on the list and hence we
- * have an existence guarantee for inodes on the unlinked list.
- *
- * Given we have an existence guarantee, we can use lockless inode cache lookups
- * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
- * for the double linked unlinked list, and we don't need any extra locking to
- * keep the list safe as all manipulations are done under the AGI buffer lock.
- * Keeping the list up to date does not require memory allocation, just finding
- * the XFS inode and updating the next/prev unlinked list aginos.
- */
-
-/*
* Find an inode on the unlinked list. This does not take references to the
* inode as we have existence guarantees by holding the AGI buffer lock and that
* only unlinked, referenced inodes can be on the unlinked inode list. If we
@@ -2089,75 +1584,11 @@ xfs_iunlink_lookup(
}
/*
- * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
- * is not in cache.
- */
-static int
-xfs_iunlink_update_backref(
- struct xfs_perag *pag,
- xfs_agino_t prev_agino,
- xfs_agino_t next_agino)
-{
- struct xfs_inode *ip;
-
- /* No update necessary if we are at the end of the list. */
- if (next_agino == NULLAGINO)
- return 0;
-
- ip = xfs_iunlink_lookup(pag, next_agino);
- if (!ip)
- return -ENOLINK;
-
- ip->i_prev_unlinked = prev_agino;
- return 0;
-}
-
-/*
- * Point the AGI unlinked bucket at an inode and log the results. The caller
- * is responsible for validating the old value.
- */
-STATIC int
-xfs_iunlink_update_bucket(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- unsigned int bucket_index,
- xfs_agino_t new_agino)
-{
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t old_value;
- int offset;
-
- ASSERT(xfs_verify_agino_or_null(pag, new_agino));
-
- old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
- old_value, new_agino);
-
- /*
- * We should never find the head of the list already set to the value
- * passed in because either we're adding or removing ourselves from the
- * head of the list.
- */
- if (old_value == new_agino) {
- xfs_buf_mark_corrupt(agibp);
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
- return -EFSCORRUPTED;
- }
-
- agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
- offset = offsetof(struct xfs_agi, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
- return 0;
-}
-
-/*
* Load the inode @next_agino into the cache and set its prev_unlinked pointer
* to @prev_agino. Caller must hold the AGI to synchronize with other changes
* to the unlinked list.
*/
-STATIC int
+int
xfs_iunlink_reload_next(
struct xfs_trans *tp,
struct xfs_buf *agibp,
@@ -2213,187 +1644,6 @@ rele:
return error;
}
-static int
-xfs_iunlink_insert_inode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t next_agino;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- int error;
-
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the pointer isn't garbage and that this inode
- * isn't already on the list.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (next_agino == agino ||
- !xfs_verify_agino_or_null(pag, next_agino)) {
- xfs_buf_mark_corrupt(agibp);
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
- return -EFSCORRUPTED;
- }
-
- /*
- * Update the prev pointer in the next inode to point back to this
- * inode.
- */
- error = xfs_iunlink_update_backref(pag, agino, next_agino);
- if (error == -ENOLINK)
- error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
- if (error)
- return error;
-
- if (next_agino != NULLAGINO) {
- /*
- * There is already another inode in the bucket, so point this
- * inode to the current head of the list.
- */
- error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
- if (error)
- return error;
- ip->i_next_unlinked = next_agino;
- }
-
- /* Point the head of the list to point to this inode. */
- ip->i_prev_unlinked = NULLAGINO;
- return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
-}
-
-/*
- * This is called when the inode's link count has gone to 0 or we are creating
- * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
- *
- * We place the on-disk inode on a list in the AGI. It will be pulled from this
- * list when the inode is freed.
- */
-int
-xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
- struct xfs_buf *agibp;
- int error;
-
- ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(VFS_I(ip)->i_mode != 0);
- trace_xfs_iunlink(ip);
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, 0, &agibp);
- if (error)
- goto out;
-
- error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
-out:
- xfs_perag_put(pag);
- return error;
-}
-
-static int
-xfs_iunlink_remove_inode(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_buf *agibp,
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = agibp->b_addr;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- xfs_agino_t head_agino;
- short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- int error;
-
- trace_xfs_iunlink_remove(ip);
-
- /*
- * Get the index into the agi hash table for the list this inode will
- * go on. Make sure the head pointer isn't garbage.
- */
- head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- if (!xfs_verify_agino(pag, head_agino)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- agi, sizeof(*agi));
- xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
- return -EFSCORRUPTED;
- }
-
- /*
- * Set our inode's next_unlinked pointer to NULL and then return
- * the old pointer value so that we can update whatever was previous
- * to us in the list to point to whatever was next in the list.
- */
- error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
- if (error)
- return error;
-
- /*
- * Update the prev pointer in the next inode to point back to previous
- * inode in the chain.
- */
- error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
- ip->i_next_unlinked);
- if (error == -ENOLINK)
- error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
- ip->i_next_unlinked);
- if (error)
- return error;
-
- if (head_agino != agino) {
- struct xfs_inode *prev_ip;
-
- prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
- if (!prev_ip) {
- xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
- return -EFSCORRUPTED;
- }
-
- error = xfs_iunlink_log_inode(tp, prev_ip, pag,
- ip->i_next_unlinked);
- prev_ip->i_next_unlinked = ip->i_next_unlinked;
- } else {
- /* Point the head of the list to the next unlinked inode. */
- error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
- ip->i_next_unlinked);
- }
-
- ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = 0;
- return error;
-}
-
-/*
- * Pull the on-disk inode from the AGI unlinked list.
- */
-int
-xfs_iunlink_remove(
- struct xfs_trans *tp,
- struct xfs_perag *pag,
- struct xfs_inode *ip)
-{
- struct xfs_buf *agibp;
- int error;
-
- trace_xfs_iunlink_remove(ip);
-
- /* Get the agi buffer first. It ensures lock ordering on the list. */
- error = xfs_read_agi(pag, tp, 0, &agibp);
- if (error)
- return error;
-
- return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
-}
-
/*
* Look up the inode number specified and if it is not already marked XFS_ISTALE
* mark it stale. We should only find clean inodes in this lookup that aren't
@@ -2612,36 +1862,10 @@ xfs_ifree(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- /*
- * Free the inode first so that we guarantee that the AGI lock is going
- * to be taken before we remove the inode from the unlinked list. This
- * makes the AGI lock -> unlinked list modification order the same as
- * used in O_TMPFILE creation.
- */
- error = xfs_difree(tp, pag, ip->i_ino, &xic);
- if (error)
- goto out;
-
- error = xfs_iunlink_remove(tp, pag, ip);
+ error = xfs_inode_uninit(tp, pag, ip, &xic);
if (error)
goto out;
- /*
- * Free any local-format data sitting around before we reset the
- * data fork to extents format. Note that the attr fork data has
- * already been freed by xfs_attr_inactive.
- */
- if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kfree(ip->i_df.if_data);
- ip->i_df.if_data = NULL;
- ip->i_df.if_bytes = 0;
- }
-
- VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
- ip->i_diflags = 0;
- ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
- ip->i_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
@@ -2650,13 +1874,6 @@ xfs_ifree(
iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
spin_unlock(&iip->ili_lock);
- /*
- * Bump the generation count so no one will be confused
- * by reincarnations of this inode.
- */
- VFS_I(ip)->i_generation++;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
if (xic.deleted)
error = xfs_ifree_cluster(tp, pag, ip, &xic);
out:
@@ -2740,13 +1957,17 @@ xfs_remove(
struct xfs_name *name,
struct xfs_inode *ip)
{
+ struct xfs_dir_update du = {
+ .dp = dp,
+ .name = name,
+ .ip = ip,
+ };
struct xfs_mount *mp = dp->i_mount;
struct xfs_trans *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int dontcare;
int error = 0;
uint resblks;
- struct xfs_parent_args *ppargs;
trace_xfs_remove(dp, name);
@@ -2763,7 +1984,7 @@ xfs_remove(
if (error)
goto std_return;
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto std_return;
@@ -2786,76 +2007,10 @@ xfs_remove(
goto out_parent;
}
- /*
- * If we're removing a directory perform some additional validation.
- */
- if (is_dir) {
- ASSERT(VFS_I(ip)->i_nlink >= 2);
- if (VFS_I(ip)->i_nlink != 2) {
- error = -ENOTEMPTY;
- goto out_trans_cancel;
- }
- if (!xfs_dir_isempty(ip)) {
- error = -ENOTEMPTY;
- goto out_trans_cancel;
- }
-
- /* Drop the link from ip's "..". */
- error = xfs_droplink(tp, dp);
- if (error)
- goto out_trans_cancel;
-
- /* Drop the "." link from ip to self. */
- error = xfs_droplink(tp, ip);
- if (error)
- goto out_trans_cancel;
-
- /*
- * Point the unlinked child directory's ".." entry to the root
- * directory to eliminate back-references to inodes that may
- * get freed before the child directory is closed. If the fs
- * gets shrunk, this can lead to dirent inode validation errors.
- */
- if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
- error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
- tp->t_mountp->m_sb.sb_rootino, 0);
- if (error)
- goto out_trans_cancel;
- }
- } else {
- /*
- * When removing a non-directory we need to log the parent
- * inode here. For a directory this is done implicitly
- * by the xfs_droplink call for the ".." entry.
- */
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- /* Drop the link from dp to ip. */
- error = xfs_droplink(tp, ip);
+ error = xfs_dir_remove_child(tp, resblks, &du);
if (error)
goto out_trans_cancel;
- error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
- if (error) {
- ASSERT(error != -ENOENT);
- goto out_trans_cancel;
- }
-
- /* Remove parent pointer. */
- if (ppargs) {
- error = xfs_parent_removename(tp, ppargs, dp, name, ip);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * Drop the link from dp to ip, and if ip was a directory, remove the
- * '.' and '..' references since we freed the directory.
- */
- xfs_dir_update_hook(dp, ip, -1, name);
-
/*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
@@ -2873,7 +2028,7 @@ xfs_remove(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return 0;
out_trans_cancel:
@@ -2882,7 +2037,7 @@ xfs_remove(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
std_return:
return error;
}
@@ -2962,160 +2117,6 @@ xfs_sort_inodes(
}
}
-static int
-xfs_finish_rename(
- struct xfs_trans *tp)
-{
- /*
- * If this is a synchronous mount, make sure that the rename transaction
- * goes to disk before returning to the user.
- */
- if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
- xfs_trans_set_sync(tp);
-
- return xfs_trans_commit(tp);
-}
-
-/*
- * xfs_cross_rename()
- *
- * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
- */
-STATIC int
-xfs_cross_rename(
- struct xfs_trans *tp,
- struct xfs_inode *dp1,
- struct xfs_name *name1,
- struct xfs_inode *ip1,
- struct xfs_parent_args *ip1_ppargs,
- struct xfs_inode *dp2,
- struct xfs_name *name2,
- struct xfs_inode *ip2,
- struct xfs_parent_args *ip2_ppargs,
- int spaceres)
-{
- int error = 0;
- int ip1_flags = 0;
- int ip2_flags = 0;
- int dp2_flags = 0;
-
- /* Swap inode number for dirent in first parent */
- error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
- if (error)
- goto out_trans_abort;
-
- /* Swap inode number for dirent in second parent */
- error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
- if (error)
- goto out_trans_abort;
-
- /*
- * If we're renaming one or more directories across different parents,
- * update the respective ".." entries (and link counts) to match the new
- * parents.
- */
- if (dp1 != dp2) {
- dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
-
- if (S_ISDIR(VFS_I(ip2)->i_mode)) {
- error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
- dp1->i_ino, spaceres);
- if (error)
- goto out_trans_abort;
-
- /* transfer ip2 ".." reference to dp1 */
- if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
- error = xfs_droplink(tp, dp2);
- if (error)
- goto out_trans_abort;
- xfs_bumplink(tp, dp1);
- }
-
- /*
- * Although ip1 isn't changed here, userspace needs
- * to be warned about the change, so that applications
- * relying on it (like backup ones), will properly
- * notify the change
- */
- ip1_flags |= XFS_ICHGTIME_CHG;
- ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
- }
-
- if (S_ISDIR(VFS_I(ip1)->i_mode)) {
- error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
- dp2->i_ino, spaceres);
- if (error)
- goto out_trans_abort;
-
- /* transfer ip1 ".." reference to dp2 */
- if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
- error = xfs_droplink(tp, dp1);
- if (error)
- goto out_trans_abort;
- xfs_bumplink(tp, dp2);
- }
-
- /*
- * Although ip2 isn't changed here, userspace needs
- * to be warned about the change, so that applications
- * relying on it (like backup ones), will properly
- * notify the change
- */
- ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
- ip2_flags |= XFS_ICHGTIME_CHG;
- }
- }
-
- /* Schedule parent pointer replacements */
- if (ip1_ppargs) {
- error = xfs_parent_replacename(tp, ip1_ppargs, dp1, name1, dp2,
- name2, ip1);
- if (error)
- goto out_trans_abort;
- }
-
- if (ip2_ppargs) {
- error = xfs_parent_replacename(tp, ip2_ppargs, dp2, name2, dp1,
- name1, ip2);
- if (error)
- goto out_trans_abort;
- }
-
- if (ip1_flags) {
- xfs_trans_ichgtime(tp, ip1, ip1_flags);
- xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
- }
- if (ip2_flags) {
- xfs_trans_ichgtime(tp, ip2, ip2_flags);
- xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
- }
- if (dp2_flags) {
- xfs_trans_ichgtime(tp, dp2, dp2_flags);
- xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
- }
- xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
-
- /*
- * Inform our hook clients that we've finished an exchange operation as
- * follows: removed the source and target files from their directories;
- * added the target to the source directory; and added the source to
- * the target directory. All inodes are locked, so it's ok to model a
- * rename this way so long as we say we deleted entries before we add
- * new ones.
- */
- xfs_dir_update_hook(dp1, ip1, -1, name1);
- xfs_dir_update_hook(dp2, ip2, -1, name2);
- xfs_dir_update_hook(dp1, ip2, 1, name1);
- xfs_dir_update_hook(dp2, ip1, 1, name2);
-
- return xfs_finish_rename(tp);
-
-out_trans_abort:
- xfs_trans_cancel(tp);
- return error;
-}
-
/*
* xfs_rename_alloc_whiteout()
*
@@ -3131,12 +2132,17 @@ xfs_rename_alloc_whiteout(
struct xfs_inode *dp,
struct xfs_inode **wip)
{
+ struct xfs_icreate_args args = {
+ .idmap = idmap,
+ .pip = dp,
+ .mode = S_IFCHR | WHITEOUT_MODE,
+ .flags = XFS_ICREATE_TMPFILE,
+ };
struct xfs_inode *tmpfile;
struct qstr name;
int error;
- error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE,
- xfs_has_parent(dp->i_mount), &tmpfile);
+ error = xfs_create_tmpfile(&args, &tmpfile);
if (error)
return error;
@@ -3176,13 +2182,20 @@ xfs_rename(
struct xfs_inode *target_ip,
unsigned int flags)
{
+ struct xfs_dir_update du_src = {
+ .dp = src_dp,
+ .name = src_name,
+ .ip = src_ip,
+ };
+ struct xfs_dir_update du_tgt = {
+ .dp = target_dp,
+ .name = target_name,
+ .ip = target_ip,
+ };
+ struct xfs_dir_update du_wip = { };
struct xfs_mount *mp = src_dp->i_mount;
struct xfs_trans *tp;
- struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_parent_args *src_ppargs = NULL;
- struct xfs_parent_args *tgt_ppargs = NULL;
- struct xfs_parent_args *wip_ppargs = NULL;
int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
@@ -3202,8 +2215,8 @@ xfs_rename(
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- error = xfs_rename_alloc_whiteout(idmap, src_name,
- target_dp, &wip);
+ error = xfs_rename_alloc_whiteout(idmap, src_name, target_dp,
+ &du_wip.ip);
if (error)
return error;
@@ -3211,21 +2224,21 @@ xfs_rename(
src_name->type = XFS_DIR3_FT_CHRDEV;
}
- xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
- inodes, &num_inodes);
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, du_wip.ip,
+ inodes, &num_inodes);
- error = xfs_parent_start(mp, &src_ppargs);
+ error = xfs_parent_start(mp, &du_src.ppargs);
if (error)
goto out_release_wip;
- if (wip) {
- error = xfs_parent_start(mp, &wip_ppargs);
+ if (du_wip.ip) {
+ error = xfs_parent_start(mp, &du_wip.ppargs);
if (error)
goto out_src_ppargs;
}
if (target_ip) {
- error = xfs_parent_start(mp, &tgt_ppargs);
+ error = xfs_parent_start(mp, &du_tgt.ppargs);
if (error)
goto out_wip_ppargs;
}
@@ -3233,7 +2246,7 @@ xfs_rename(
retry:
nospace_error = 0;
spaceres = xfs_rename_space_res(mp, src_name->len, target_ip != NULL,
- target_name->len, wip != NULL);
+ target_name->len, du_wip.ip != NULL);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
nospace_error = error;
@@ -3248,7 +2261,7 @@ retry:
* We don't allow reservationless renaming when parent pointers are
* enabled because we can't back out if the xattrs must grow.
*/
- if (src_ppargs && nospace_error) {
+ if (du_src.ppargs && nospace_error) {
error = nospace_error;
xfs_trans_cancel(tp);
goto out_tgt_ppargs;
@@ -3280,8 +2293,8 @@ retry:
xfs_trans_ijoin(tp, src_ip, 0);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, 0);
- if (wip)
- xfs_trans_ijoin(tp, wip, 0);
+ if (du_wip.ip)
+ xfs_trans_ijoin(tp, du_wip.ip, 0);
/*
* If we are using project inheritance, we only allow renames
@@ -3296,11 +2309,11 @@ retry:
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE) {
- error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
- src_ppargs, target_dp, target_name, target_ip,
- tgt_ppargs, spaceres);
- nospace_error = 0;
- goto out_unlock;
+ error = xfs_dir_exchange_children(tp, &du_src, &du_tgt,
+ spaceres);
+ if (error)
+ goto out_trans_cancel;
+ goto out_commit;
}
/*
@@ -3333,39 +2346,12 @@ retry:
* We don't allow quotaless renaming when parent pointers are enabled
* because we can't back out if the xattrs must grow.
*/
- if (src_ppargs && nospace_error) {
+ if (du_src.ppargs && nospace_error) {
error = nospace_error;
goto out_trans_cancel;
}
/*
- * Check for expected errors before we dirty the transaction
- * so we can return an error without a transaction abort.
- */
- if (target_ip == NULL) {
- /*
- * If there's no space reservation, check the entry will
- * fit before actually inserting it.
- */
- if (!spaceres) {
- error = xfs_dir_canenter(tp, target_dp, target_name);
- if (error)
- goto out_trans_cancel;
- }
- } else {
- /*
- * If target exists and it's a directory, check that whether
- * it can be destroyed.
- */
- if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
- (!xfs_dir_isempty(target_ip) ||
- (VFS_I(target_ip)->i_nlink > 2))) {
- error = -EEXIST;
- goto out_trans_cancel;
- }
- }
-
- /*
* Lock the AGI buffers we need to handle bumping the nlink of the
* whiteout inode off the unlinked list and to handle dropping the
* nlink of the target inode. Per locking order rules, do this in
@@ -3376,7 +2362,7 @@ retry:
* target_ip is either null or an empty directory.
*/
for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
- if (inodes[i] == wip ||
+ if (inodes[i] == du_wip.ip ||
(inodes[i] == target_ip &&
(VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
struct xfs_perag *pag;
@@ -3391,188 +2377,29 @@ retry:
}
}
- /*
- * Directory entry creation below may acquire the AGF. Remove
- * the whiteout from the unlinked list first to preserve correct
- * AGI/AGF locking order. This dirties the transaction so failures
- * after this point will abort and log recovery will clean up the
- * mess.
- *
- * For whiteouts, we need to bump the link count on the whiteout
- * inode. After this point, we have a real link, clear the tmpfile
- * state flag from the inode so it doesn't accidentally get misused
- * in future.
- */
- if (wip) {
- struct xfs_perag *pag;
-
- ASSERT(VFS_I(wip)->i_nlink == 0);
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
- error = xfs_iunlink_remove(tp, pag, wip);
- xfs_perag_put(pag);
- if (error)
- goto out_trans_cancel;
-
- xfs_bumplink(tp, wip);
- VFS_I(wip)->i_state &= ~I_LINKABLE;
- }
-
- /*
- * Set up the target.
- */
- if (target_ip == NULL) {
- /*
- * If target does not exist and the rename crosses
- * directories, adjust the target directory link count
- * to account for the ".." reference from the new entry.
- */
- error = xfs_dir_createname(tp, target_dp, target_name,
- src_ip->i_ino, spaceres);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ichgtime(tp, target_dp,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- if (new_parent && src_is_directory) {
- xfs_bumplink(tp, target_dp);
- }
- } else { /* target_ip != NULL */
- /*
- * Link the source inode under the target name.
- * If the source inode is a directory and we are moving
- * it across directories, its ".." entry will be
- * inconsistent until we replace that down below.
- *
- * In case there is already an entry with the same
- * name at the destination directory, remove it first.
- */
- error = xfs_dir_replace(tp, target_dp, target_name,
- src_ip->i_ino, spaceres);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ichgtime(tp, target_dp,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- /*
- * Decrement the link count on the target since the target
- * dir no longer points to it.
- */
- error = xfs_droplink(tp, target_ip);
- if (error)
- goto out_trans_cancel;
-
- if (src_is_directory) {
- /*
- * Drop the link from the old "." entry.
- */
- error = xfs_droplink(tp, target_ip);
- if (error)
- goto out_trans_cancel;
- }
- } /* target_ip != NULL */
-
- /*
- * Remove the source.
- */
- if (new_parent && src_is_directory) {
- /*
- * Rewrite the ".." entry to point to the new
- * directory.
- */
- error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
- target_dp->i_ino, spaceres);
- ASSERT(error != -EEXIST);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * We always want to hit the ctime on the source inode.
- *
- * This isn't strictly required by the standards since the source
- * inode isn't really being changed, but old unix file systems did
- * it and some incremental backup programs won't work without it.
- */
- xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
-
- /*
- * Adjust the link count on src_dp. This is necessary when
- * renaming a directory, either within one parent when
- * the target existed, or across two parent directories.
- */
- if (src_is_directory && (new_parent || target_ip != NULL)) {
-
- /*
- * Decrement link count on src_directory since the
- * entry that's moved no longer points to it.
- */
- error = xfs_droplink(tp, src_dp);
- if (error)
- goto out_trans_cancel;
- }
-
- /*
- * For whiteouts, we only need to update the source dirent with the
- * inode number of the whiteout inode rather than removing it
- * altogether.
- */
- if (wip)
- error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
- spaceres);
- else
- error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
- spaceres);
-
+ error = xfs_dir_rename_children(tp, &du_src, &du_tgt, spaceres,
+ &du_wip);
if (error)
goto out_trans_cancel;
- /* Schedule parent pointer updates. */
- if (wip_ppargs) {
- error = xfs_parent_addname(tp, wip_ppargs, src_dp, src_name,
- wip);
- if (error)
- goto out_trans_cancel;
- }
-
- if (src_ppargs) {
- error = xfs_parent_replacename(tp, src_ppargs, src_dp,
- src_name, target_dp, target_name, src_ip);
- if (error)
- goto out_trans_cancel;
- }
-
- if (tgt_ppargs) {
- error = xfs_parent_removename(tp, tgt_ppargs, target_dp,
- target_name, target_ip);
- if (error)
- goto out_trans_cancel;
+ if (du_wip.ip) {
+ /*
+ * Now we have a real link, clear the "I'm a tmpfile" state
+ * flag from the inode so it doesn't accidentally get misused in
+ * future.
+ */
+ VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
}
- xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
- if (new_parent)
- xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-
+out_commit:
/*
- * Inform our hook clients that we've finished a rename operation as
- * follows: removed the source and target files from their directories;
- * that we've added the source to the target directory; and finally
- * that we've added the whiteout, if there was one. All inodes are
- * locked, so it's ok to model a rename this way so long as we say we
- * deleted entries before we add new ones.
+ * If this is a synchronous mount, make sure that the rename
+ * transaction goes to disk before returning to the user.
*/
- if (target_ip)
- xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
- xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
- xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
- if (wip)
- xfs_dir_update_hook(src_dp, wip, 1, src_name);
+ if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp))
+ xfs_trans_set_sync(tp);
- error = xfs_finish_rename(tp);
+ error = xfs_trans_commit(tp);
nospace_error = 0;
goto out_unlock;
@@ -3581,14 +2408,14 @@ out_trans_cancel:
out_unlock:
xfs_iunlock_rename(inodes, num_inodes);
out_tgt_ppargs:
- xfs_parent_finish(mp, tgt_ppargs);
+ xfs_parent_finish(mp, du_tgt.ppargs);
out_wip_ppargs:
- xfs_parent_finish(mp, wip_ppargs);
+ xfs_parent_finish(mp, du_wip.ppargs);
out_src_ppargs:
- xfs_parent_finish(mp, src_ppargs);
+ xfs_parent_finish(mp, du_src.ppargs);
out_release_wip:
- if (wip)
- xfs_irele(wip);
+ if (du_wip.ip)
+ xfs_irele(du_wip.ip);
if (error == -ENOSPC && nospace_error)
error = nospace_error;
return error;
@@ -3728,6 +2555,7 @@ flush_out:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
+ set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
/*
@@ -4291,3 +3119,11 @@ xfs_inode_alloc_unitsize(
return XFS_FSB_TO_B(ip->i_mount, blocks);
}
+
+/* Should we always be using copy on write for file writes? */
+bool
+xfs_is_always_cow_inode(
+ struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 292b90b5f2ac..51defdebef30 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -8,6 +8,7 @@
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
+#include "xfs_inode_util.h"
/*
* Kernel only inode definitions
@@ -270,15 +271,6 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags)
return ret;
}
-static inline prid_t
-xfs_get_initial_prid(struct xfs_inode *dp)
-{
- if (dp->i_diflags & XFS_DIFLAG_PROJINHERIT)
- return dp->i_projid;
-
- return XFS_PROJID_DEFAULT;
-}
-
static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
@@ -292,6 +284,13 @@ static inline bool xfs_is_metadata_inode(struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
+bool xfs_is_always_cow_inode(struct xfs_inode *ip);
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
/*
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
@@ -517,12 +516,9 @@ int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
-int xfs_create(struct mnt_idmap *idmap,
- struct xfs_inode *dp, struct xfs_name *name,
- umode_t mode, dev_t rdev, bool need_xattr,
- struct xfs_inode **ipp);
-int xfs_create_tmpfile(struct mnt_idmap *idmap,
- struct xfs_inode *dp, umode_t mode, bool init_xattrs,
+int xfs_create(const struct xfs_icreate_args *iargs,
+ struct xfs_name *name, struct xfs_inode **ipp);
+int xfs_create_tmpfile(const struct xfs_icreate_args *iargs,
struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
@@ -542,7 +538,6 @@ void xfs_assert_ilocked(struct xfs_inode *, uint);
uint xfs_ilock_data_map_shared(struct xfs_inode *);
uint xfs_ilock_attr_map_shared(struct xfs_inode *);
-uint xfs_ip2xflags(struct xfs_inode *);
int xfs_ifree(struct xfs_trans *, struct xfs_inode *);
int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
@@ -556,13 +551,8 @@ int xfs_iflush_cluster(struct xfs_buf *);
void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
struct xfs_inode *ip1, uint ip1_mode);
-xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
-xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip);
-
-int xfs_init_new_inode(struct mnt_idmap *idmap, struct xfs_trans *tp,
- struct xfs_inode *pip, xfs_ino_t ino, umode_t mode,
- xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs,
- struct xfs_inode **ipp);
+int xfs_icreate(struct xfs_trans *tp, xfs_ino_t ino,
+ const struct xfs_icreate_args *args, struct xfs_inode **ipp);
static inline int
xfs_itruncate_extents(
@@ -616,18 +606,15 @@ extern struct kmem_cache *xfs_inode_cache;
bool xfs_inode_needs_inactive(struct xfs_inode *ip);
-int xfs_iunlink(struct xfs_trans *tp, struct xfs_inode *ip);
-int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
- struct xfs_inode *ip);
struct xfs_inode *xfs_iunlink_lookup(struct xfs_perag *pag, xfs_agino_t agino);
+int xfs_iunlink_reload_next(struct xfs_trans *tp, struct xfs_buf *agibp,
+ xfs_agino_t prev_agino, xfs_agino_t next_agino);
void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2);
-int xfs_droplink(struct xfs_trans *tp, struct xfs_inode *ip);
-void xfs_bumplink(struct xfs_trans *tp, struct xfs_inode *ip);
void xfs_lock_inodes(struct xfs_inode **ips, int inodes, uint lock_mode);
void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
@@ -645,29 +632,8 @@ void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
unsigned int xfs_inode_alloc_unitsize(struct xfs_inode *ip);
-struct xfs_dir_update_params {
- const struct xfs_inode *dp;
- const struct xfs_inode *ip;
- const struct xfs_name *name;
- int delta;
-};
-
-#ifdef CONFIG_XFS_LIVE_HOOKS
-void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
- int delta, const struct xfs_name *name);
-
-struct xfs_dir_hook {
- struct xfs_hook dirent_hook;
-};
-
-void xfs_dir_hook_disable(void);
-void xfs_dir_hook_enable(void);
-
-int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
-void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
-void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
-#else
-# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0)
-#endif /* CONFIG_XFS_LIVE_HOOKS */
+int xfs_icreate_dqalloc(const struct xfs_icreate_args *args,
+ struct xfs_dquot **udqpp, struct xfs_dquot **gdqpp,
+ struct xfs_dquot **pdqpp);
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f28d653300d1..b509cbd191f4 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -37,6 +37,36 @@ xfs_inode_item_sort(
return INODE_ITEM(lip)->ili_inode->i_ino;
}
+#ifdef DEBUG_EXPENSIVE
+static void
+xfs_inode_item_precommit_check(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dinode *dip;
+ xfs_failaddr_t fa;
+
+ dip = kzalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | GFP_NOFS);
+ if (!dip) {
+ ASSERT(dip != NULL);
+ return;
+ }
+
+ xfs_inode_to_disk(ip, dip, 0);
+ xfs_dinode_calc_crc(mp, dip);
+ fa = xfs_dinode_verify(mp, ip->i_ino, dip);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), fa);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(fa == NULL);
+ }
+ kfree(dip);
+}
+#else
+# define xfs_inode_item_precommit_check(ip) ((void)0)
+#endif
+
/*
* Prior to finally logging the inode, we have to ensure that all the
* per-modification inode state changes are applied. This includes VFS inode
@@ -169,6 +199,8 @@ xfs_inode_item_precommit(
iip->ili_fields |= (flags | iip->ili_last_fields);
spin_unlock(&iip->ili_lock);
+ xfs_inode_item_precommit_check(ip);
+
/*
* We are done with the log item transaction dirty state, so clear it so
* that it doesn't pollute future transactions.
@@ -933,6 +965,7 @@ xfs_iflush_finish(
}
iip->ili_last_fields = 0;
iip->ili_flush_lsn = 0;
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
spin_unlock(&iip->ili_lock);
xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
if (drop_buffer)
@@ -991,8 +1024,10 @@ xfs_buf_inode_io_fail(
{
struct xfs_log_item *lip;
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
set_bit(XFS_LI_FAILED, &lip->li_flags);
+ clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
+ }
}
/*
@@ -1011,6 +1046,7 @@ xfs_iflush_abort_clean(
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
+ clear_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
}
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f0117188f302..4e933db75b12 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -469,66 +469,6 @@ xfs_fileattr_get(
return 0;
}
-STATIC uint16_t
-xfs_flags2diflags(
- struct xfs_inode *ip,
- unsigned int xflags)
-{
- /* can't set PREALLOC this way, just preserve it */
- uint16_t di_flags =
- (ip->i_diflags & XFS_DIFLAG_PREALLOC);
-
- if (xflags & FS_XFLAG_IMMUTABLE)
- di_flags |= XFS_DIFLAG_IMMUTABLE;
- if (xflags & FS_XFLAG_APPEND)
- di_flags |= XFS_DIFLAG_APPEND;
- if (xflags & FS_XFLAG_SYNC)
- di_flags |= XFS_DIFLAG_SYNC;
- if (xflags & FS_XFLAG_NOATIME)
- di_flags |= XFS_DIFLAG_NOATIME;
- if (xflags & FS_XFLAG_NODUMP)
- di_flags |= XFS_DIFLAG_NODUMP;
- if (xflags & FS_XFLAG_NODEFRAG)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (xflags & FS_XFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
- if (S_ISDIR(VFS_I(ip)->i_mode)) {
- if (xflags & FS_XFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (xflags & FS_XFLAG_NOSYMLINKS)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if (xflags & FS_XFLAG_EXTSZINHERIT)
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- if (xflags & FS_XFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(VFS_I(ip)->i_mode)) {
- if (xflags & FS_XFLAG_REALTIME)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (xflags & FS_XFLAG_EXTSIZE)
- di_flags |= XFS_DIFLAG_EXTSIZE;
- }
-
- return di_flags;
-}
-
-STATIC uint64_t
-xfs_flags2diflags2(
- struct xfs_inode *ip,
- unsigned int xflags)
-{
- uint64_t di_flags2 =
- (ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
- XFS_DIFLAG2_BIGTIME |
- XFS_DIFLAG2_NREXT64));
-
- if (xflags & FS_XFLAG_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
- if (xflags & FS_XFLAG_COWEXTSIZE)
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
-
- return di_flags2;
-}
-
static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 378342673925..72c981e3dc92 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -717,53 +717,30 @@ imap_needs_cow(
return true;
}
+/*
+ * Extents not yet cached requires exclusive access, don't block for
+ * IOMAP_NOWAIT.
+ *
+ * This is basically an opencoded xfs_ilock_data_map_shared() call, but with
+ * support for IOMAP_NOWAIT.
+ */
static int
xfs_ilock_for_iomap(
struct xfs_inode *ip,
unsigned flags,
unsigned *lockmode)
{
- unsigned int mode = *lockmode;
- bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
-
- /*
- * COW writes may allocate delalloc space or convert unwritten COW
- * extents, so we need to make sure to take the lock exclusively here.
- */
- if (xfs_is_cow_inode(ip) && is_write)
- mode = XFS_ILOCK_EXCL;
-
- /*
- * Extents not yet cached requires exclusive access, don't block. This
- * is an opencoded xfs_ilock_data_map_shared() call but with
- * non-blocking behaviour.
- */
- if (xfs_need_iread_extents(&ip->i_df)) {
- if (flags & IOMAP_NOWAIT)
- return -EAGAIN;
- mode = XFS_ILOCK_EXCL;
- }
-
-relock:
if (flags & IOMAP_NOWAIT) {
- if (!xfs_ilock_nowait(ip, mode))
+ if (xfs_need_iread_extents(&ip->i_df))
+ return -EAGAIN;
+ if (!xfs_ilock_nowait(ip, *lockmode))
return -EAGAIN;
} else {
- xfs_ilock(ip, mode);
- }
-
- /*
- * The reflink iflag could have changed since the earlier unlocked
- * check, so if we got ILOCK_SHARED for a write and but we're now a
- * reflink inode we have to switch to ILOCK_EXCL and relock.
- */
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
- xfs_iunlock(ip, mode);
- mode = XFS_ILOCK_EXCL;
- goto relock;
+ if (xfs_need_iread_extents(&ip->i_df))
+ *lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, *lockmode);
}
- *lockmode = mode;
return 0;
}
@@ -801,7 +778,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
- unsigned int lockmode = XFS_ILOCK_SHARED;
+ unsigned int lockmode;
u64 seq;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -817,10 +794,30 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ /*
+ * COW writes may allocate delalloc space or convert unwritten COW
+ * extents, so we need to make sure to take the lock exclusively here.
+ */
+ if (xfs_is_cow_inode(ip))
+ lockmode = XFS_ILOCK_EXCL;
+ else
+ lockmode = XFS_ILOCK_SHARED;
+
+relock:
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
+ /*
+ * The reflink iflag could have changed since the earlier unlocked
+ * check, check if it again and relock if needed.
+ */
+ if (xfs_is_cow_inode(ip) && lockmode == XFS_ILOCK_SHARED) {
+ xfs_iunlock(ip, lockmode);
+ lockmode = XFS_ILOCK_EXCL;
+ goto relock;
+ }
+
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
if (error)
@@ -1148,33 +1145,23 @@ xfs_buffered_write_iomap_begin(
}
}
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- allocfork == XFS_DATA_FORK ? &imap : &cmap,
- allocfork == XFS_DATA_FORK ? &icur : &ccur,
- allocfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- fallthrough;
- default:
- goto out_unlock;
- }
-
if (allocfork == XFS_COW_FORK) {
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &cmap,
+ &ccur, cow_eof);
+ if (error)
+ goto out_unlock;
+
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
goto found_cow;
}
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+ eof);
+ if (error)
+ goto out_unlock;
+
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ff222827e550..1cdc8034f54d 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,8 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_bmap_btree.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
@@ -26,6 +28,7 @@
#include "xfs_ioctl.h"
#include "xfs_xattr.h"
#include "xfs_file.h"
+#include "xfs_bmap.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -157,8 +160,6 @@ xfs_create_need_xattr(
if (dir->i_sb->s_security)
return true;
#endif
- if (xfs_has_parent(XFS_I(dir)->i_mount))
- return true;
return false;
}
@@ -172,49 +173,55 @@ xfs_generic_create(
dev_t rdev,
struct file *tmpfile) /* unnamed file */
{
- struct inode *inode;
- struct xfs_inode *ip = NULL;
- struct posix_acl *default_acl, *acl;
- struct xfs_name name;
- int error;
+ struct xfs_icreate_args args = {
+ .idmap = idmap,
+ .pip = XFS_I(dir),
+ .rdev = rdev,
+ .mode = mode,
+ };
+ struct inode *inode;
+ struct xfs_inode *ip = NULL;
+ struct posix_acl *default_acl, *acl;
+ struct xfs_name name;
+ int error;
/*
* Irix uses Missed'em'V split, but doesn't want to see
* the upper 5 bits of (14bit) major.
*/
- if (S_ISCHR(mode) || S_ISBLK(mode)) {
- if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+ if (S_ISCHR(args.mode) || S_ISBLK(args.mode)) {
+ if (unlikely(!sysv_valid_dev(args.rdev) ||
+ MAJOR(args.rdev) & ~0x1ff))
return -EINVAL;
} else {
- rdev = 0;
+ args.rdev = 0;
}
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ error = posix_acl_create(dir, &args.mode, &default_acl, &acl);
if (error)
return error;
/* Verify mode is valid also for tmpfile case */
- error = xfs_dentry_mode_to_name(&name, dentry, mode);
+ error = xfs_dentry_mode_to_name(&name, dentry, args.mode);
if (unlikely(error))
goto out_free_acl;
if (!tmpfile) {
- error = xfs_create(idmap, XFS_I(dir), &name, mode, rdev,
- xfs_create_need_xattr(dir, default_acl, acl),
- &ip);
+ if (xfs_create_need_xattr(dir, default_acl, acl))
+ args.flags |= XFS_ICREATE_INIT_XATTRS;
+
+ error = xfs_create(&args, &name, &ip);
} else {
- bool init_xattrs = false;
+ args.flags |= XFS_ICREATE_TMPFILE;
/*
- * If this temporary file will be linkable, set up the file
- * with an attr fork to receive a parent pointer.
+ * If this temporary file will not be linkable, don't bother
+ * creating an attr fork to receive a parent pointer.
*/
- if (!(tmpfile->f_flags & O_EXCL) &&
- xfs_has_parent(XFS_I(dir)->i_mount))
- init_xattrs = true;
+ if (tmpfile->f_flags & O_EXCL)
+ args.flags |= XFS_ICREATE_UNLINKABLE;
- error = xfs_create_tmpfile(idmap, XFS_I(dir), mode,
- init_xattrs, &ip);
+ error = xfs_create_tmpfile(&args, &ip);
}
if (unlikely(error))
goto out_free_acl;
@@ -811,6 +818,7 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
+ uint resblks = 0;
bool did_zeroing = false;
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
@@ -917,7 +925,17 @@ xfs_setattr_size(
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ /*
+ * For realtime inode with more than one block rtextsize, we need the
+ * block reservation for bmap btree block allocations/splits that can
+ * happen since it could split the tail written extent and convert the
+ * right beyond EOF one to unwritten.
+ */
+ if (xfs_inode_has_bigrtalloc(ip))
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+ 0, 0, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ac355328121a..54a098fe7285 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -135,8 +135,6 @@ typedef __u32 xfs_nlink_t;
*/
#define __this_address ({ __label__ __here; __here: barrier(); &&__here; })
-#define XFS_PROJID_DEFAULT 0
-
#define howmany(x, y) (((x)+((y)-1))/(y))
static inline void delay(long ticks)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 416c15494983..817ea7e0a8ab 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -30,10 +30,6 @@ xlog_alloc_log(
struct xfs_buftarg *log_target,
xfs_daddr_t blk_offset,
int num_bblks);
-STATIC int
-xlog_space_left(
- struct xlog *log,
- atomic64_t *head);
STATIC void
xlog_dealloc_log(
struct xlog *log);
@@ -51,19 +47,12 @@ xlog_state_get_iclog_space(
struct xlog_ticket *ticket,
int *logoffsetp);
STATIC void
-xlog_grant_push_ail(
- struct xlog *log,
- int need_bytes);
-STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog,
struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
-xlog_verify_grant_tail(
- struct xlog *log);
-STATIC void
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
@@ -73,7 +62,6 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog);
#else
-#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b)
#endif
@@ -141,70 +129,66 @@ xlog_prepare_iovec(
return buf;
}
-static void
+static inline void
xlog_grant_sub_space(
- struct xlog *log,
- atomic64_t *head,
- int bytes)
+ struct xlog_grant_head *head,
+ int64_t bytes)
{
- int64_t head_val = atomic64_read(head);
- int64_t new, old;
-
- do {
- int cycle, space;
-
- xlog_crack_grant_head_val(head_val, &cycle, &space);
-
- space -= bytes;
- if (space < 0) {
- space += log->l_logsize;
- cycle--;
- }
-
- old = head_val;
- new = xlog_assign_grant_head_val(cycle, space);
- head_val = atomic64_cmpxchg(head, old, new);
- } while (head_val != old);
+ atomic64_sub(bytes, &head->grant);
}
-static void
+static inline void
xlog_grant_add_space(
- struct xlog *log,
- atomic64_t *head,
- int bytes)
+ struct xlog_grant_head *head,
+ int64_t bytes)
{
- int64_t head_val = atomic64_read(head);
- int64_t new, old;
-
- do {
- int tmp;
- int cycle, space;
-
- xlog_crack_grant_head_val(head_val, &cycle, &space);
-
- tmp = log->l_logsize - space;
- if (tmp > bytes)
- space += bytes;
- else {
- space = bytes - tmp;
- cycle++;
- }
-
- old = head_val;
- new = xlog_assign_grant_head_val(cycle, space);
- head_val = atomic64_cmpxchg(head, old, new);
- } while (head_val != old);
+ atomic64_add(bytes, &head->grant);
}
-STATIC void
+static void
xlog_grant_head_init(
struct xlog_grant_head *head)
{
- xlog_assign_grant_head(&head->grant, 1, 0);
+ atomic64_set(&head->grant, 0);
INIT_LIST_HEAD(&head->waiters);
spin_lock_init(&head->lock);
}
+void
+xlog_grant_return_space(
+ struct xlog *log,
+ xfs_lsn_t old_head,
+ xfs_lsn_t new_head)
+{
+ int64_t diff = xlog_lsn_sub(log, new_head, old_head);
+
+ xlog_grant_sub_space(&log->l_reserve_head, diff);
+ xlog_grant_sub_space(&log->l_write_head, diff);
+}
+
+/*
+ * Return the space in the log between the tail and the head. In the case where
+ * we have overrun available reservation space, return 0. The memory barrier
+ * pairs with the smp_wmb() in xlog_cil_ail_insert() to ensure that grant head
+ * vs tail space updates are seen in the correct order and hence avoid
+ * transients as space is transferred from the grant heads to the AIL on commit
+ * completion.
+ */
+static uint64_t
+xlog_grant_space_left(
+ struct xlog *log,
+ struct xlog_grant_head *head)
+{
+ int64_t free_bytes;
+
+ smp_rmb(); /* paired with smp_wmb in xlog_cil_ail_insert() */
+ free_bytes = log->l_logsize - READ_ONCE(log->l_tail_space) -
+ atomic64_read(&head->grant);
+ if (free_bytes > 0)
+ return free_bytes;
+ return 0;
+}
+
STATIC void
xlog_grant_head_wake_all(
struct xlog_grant_head *head)
@@ -242,42 +226,15 @@ xlog_grant_head_wake(
{
struct xlog_ticket *tic;
int need_bytes;
- bool woken_task = false;
list_for_each_entry(tic, &head->waiters, t_queue) {
-
- /*
- * There is a chance that the size of the CIL checkpoints in
- * progress at the last AIL push target calculation resulted in
- * limiting the target to the log head (l_last_sync_lsn) at the
- * time. This may not reflect where the log head is now as the
- * CIL checkpoints may have completed.
- *
- * Hence when we are woken here, it may be that the head of the
- * log that has moved rather than the tail. As the tail didn't
- * move, there still won't be space available for the
- * reservation we require. However, if the AIL has already
- * pushed to the target defined by the old log head location, we
- * will hang here waiting for something else to update the AIL
- * push target.
- *
- * Therefore, if there isn't space to wake the first waiter on
- * the grant head, we need to push the AIL again to ensure the
- * target reflects both the current log tail and log head
- * position before we wait for the tail to move again.
- */
-
need_bytes = xlog_ticket_reservation(log, head, tic);
- if (*free_bytes < need_bytes) {
- if (!woken_task)
- xlog_grant_push_ail(log, need_bytes);
+ if (*free_bytes < need_bytes)
return false;
- }
*free_bytes -= need_bytes;
trace_xfs_log_grant_wake_up(log, tic);
wake_up_process(tic->t_task);
- woken_task = true;
}
return true;
@@ -296,13 +253,15 @@ xlog_grant_head_wait(
do {
if (xlog_is_shutdown(log))
goto shutdown;
- xlog_grant_push_ail(log, need_bytes);
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
+ /* Push on the AIL to free up all the log space. */
+ xfs_ail_push_all(log->l_ailp);
+
trace_xfs_log_grant_sleep(log, tic);
schedule();
trace_xfs_log_grant_wake(log, tic);
@@ -310,7 +269,7 @@ xlog_grant_head_wait(
spin_lock(&head->lock);
if (xlog_is_shutdown(log))
goto shutdown;
- } while (xlog_space_left(log, &head->grant) < need_bytes);
+ } while (xlog_grant_space_left(log, head) < need_bytes);
list_del_init(&tic->t_queue);
return 0;
@@ -355,7 +314,7 @@ xlog_grant_head_check(
* otherwise try to get some space for this transaction.
*/
*need_bytes = xlog_ticket_reservation(log, head, tic);
- free_bytes = xlog_space_left(log, &head->grant);
+ free_bytes = xlog_grant_space_left(log, head);
if (!list_empty_careful(&head->waiters)) {
spin_lock(&head->lock);
if (!xlog_grant_head_wake(log, head, &free_bytes) ||
@@ -418,9 +377,6 @@ xfs_log_regrant(
* of rolling transactions in the log easily.
*/
tic->t_tid++;
-
- xlog_grant_push_ail(log, tic->t_unit_res);
-
tic->t_curr_res = tic->t_unit_res;
if (tic->t_cnt > 0)
return 0;
@@ -432,9 +388,8 @@ xfs_log_regrant(
if (error)
goto out_error;
- xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ xlog_grant_add_space(&log->l_write_head, need_bytes);
trace_xfs_log_regrant_exit(log, tic);
- xlog_verify_grant_tail(log);
return 0;
out_error:
@@ -477,21 +432,15 @@ xfs_log_reserve(
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
-
- xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
- : tic->t_unit_res);
-
trace_xfs_log_reserve(log, tic);
-
error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
&need_bytes);
if (error)
goto out_error;
- xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
- xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+ xlog_grant_add_space(&log->l_reserve_head, need_bytes);
+ xlog_grant_add_space(&log->l_write_head, need_bytes);
trace_xfs_log_reserve_exit(log, tic);
- xlog_verify_grant_tail(log);
return 0;
out_error:
@@ -571,7 +520,6 @@ xlog_state_release_iclog(
struct xlog_in_core *iclog,
struct xlog_ticket *ticket)
{
- xfs_lsn_t tail_lsn;
bool last_ref;
lockdep_assert_held(&log->l_icloglock);
@@ -586,8 +534,8 @@ xlog_state_release_iclog(
if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
(iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
!iclog->ic_header.h_tail_lsn) {
- tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ iclog->ic_header.h_tail_lsn =
+ cpu_to_be64(atomic64_read(&log->l_tail_lsn));
}
last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
@@ -1149,7 +1097,7 @@ xfs_log_space_wake(
ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_write_head.lock);
- free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+ free_bytes = xlog_grant_space_left(log, &log->l_write_head);
xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
spin_unlock(&log->l_write_head.lock);
}
@@ -1158,7 +1106,7 @@ xfs_log_space_wake(
ASSERT(!xlog_in_recovery(log));
spin_lock(&log->l_reserve_head.lock);
- free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+ free_bytes = xlog_grant_space_left(log, &log->l_reserve_head);
xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
spin_unlock(&log->l_reserve_head.lock);
}
@@ -1272,105 +1220,6 @@ xfs_log_cover(
return error;
}
-/*
- * We may be holding the log iclog lock upon entering this routine.
- */
-xfs_lsn_t
-xlog_assign_tail_lsn_locked(
- struct xfs_mount *mp)
-{
- struct xlog *log = mp->m_log;
- struct xfs_log_item *lip;
- xfs_lsn_t tail_lsn;
-
- assert_spin_locked(&mp->m_ail->ail_lock);
-
- /*
- * To make sure we always have a valid LSN for the log tail we keep
- * track of the last LSN which was committed in log->l_last_sync_lsn,
- * and use that when the AIL was empty.
- */
- lip = xfs_ail_min(mp->m_ail);
- if (lip)
- tail_lsn = lip->li_lsn;
- else
- tail_lsn = atomic64_read(&log->l_last_sync_lsn);
- trace_xfs_log_assign_tail_lsn(log, tail_lsn);
- atomic64_set(&log->l_tail_lsn, tail_lsn);
- return tail_lsn;
-}
-
-xfs_lsn_t
-xlog_assign_tail_lsn(
- struct xfs_mount *mp)
-{
- xfs_lsn_t tail_lsn;
-
- spin_lock(&mp->m_ail->ail_lock);
- tail_lsn = xlog_assign_tail_lsn_locked(mp);
- spin_unlock(&mp->m_ail->ail_lock);
-
- return tail_lsn;
-}
-
-/*
- * Return the space in the log between the tail and the head. The head
- * is passed in the cycle/bytes formal parms. In the special case where
- * the reserve head has wrapped passed the tail, this calculation is no
- * longer valid. In this case, just return 0 which means there is no space
- * in the log. This works for all places where this function is called
- * with the reserve head. Of course, if the write head were to ever
- * wrap the tail, we should blow up. Rather than catch this case here,
- * we depend on other ASSERTions in other parts of the code. XXXmiken
- *
- * If reservation head is behind the tail, we have a problem. Warn about it,
- * but then treat it as if the log is empty.
- *
- * If the log is shut down, the head and tail may be invalid or out of whack, so
- * shortcut invalidity asserts in this case so that we don't trigger them
- * falsely.
- */
-STATIC int
-xlog_space_left(
- struct xlog *log,
- atomic64_t *head)
-{
- int tail_bytes;
- int tail_cycle;
- int head_cycle;
- int head_bytes;
-
- xlog_crack_grant_head(head, &head_cycle, &head_bytes);
- xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
- tail_bytes = BBTOB(tail_bytes);
- if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
- return log->l_logsize - (head_bytes - tail_bytes);
- if (tail_cycle + 1 < head_cycle)
- return 0;
-
- /* Ignore potential inconsistency when shutdown. */
- if (xlog_is_shutdown(log))
- return log->l_logsize;
-
- if (tail_cycle < head_cycle) {
- ASSERT(tail_cycle == (head_cycle - 1));
- return tail_bytes - head_bytes;
- }
-
- /*
- * The reservation head is behind the tail. In this case we just want to
- * return the size of the log as the amount of space left.
- */
- xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
- xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d",
- tail_cycle, tail_bytes);
- xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d",
- head_cycle, head_bytes);
- ASSERT(0);
- return log->l_logsize;
-}
-
-
static void
xlog_ioend_work(
struct work_struct *work)
@@ -1543,7 +1392,6 @@ xlog_alloc_log(
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
- xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1)
@@ -1668,89 +1516,6 @@ out:
} /* xlog_alloc_log */
/*
- * Compute the LSN that we'd need to push the log tail towards in order to have
- * (a) enough on-disk log space to log the number of bytes specified, (b) at
- * least 25% of the log space free, and (c) at least 256 blocks free. If the
- * log free space already meets all three thresholds, this function returns
- * NULLCOMMITLSN.
- */
-xfs_lsn_t
-xlog_grant_push_threshold(
- struct xlog *log,
- int need_bytes)
-{
- xfs_lsn_t threshold_lsn = 0;
- xfs_lsn_t last_sync_lsn;
- int free_blocks;
- int free_bytes;
- int threshold_block;
- int threshold_cycle;
- int free_threshold;
-
- ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
-
- free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
- free_blocks = BTOBBT(free_bytes);
-
- /*
- * Set the threshold for the minimum number of free blocks in the
- * log to the maximum of what the caller needs, one quarter of the
- * log, and 256 blocks.
- */
- free_threshold = BTOBB(need_bytes);
- free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
- free_threshold = max(free_threshold, 256);
- if (free_blocks >= free_threshold)
- return NULLCOMMITLSN;
-
- xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
- &threshold_block);
- threshold_block += free_threshold;
- if (threshold_block >= log->l_logBBsize) {
- threshold_block -= log->l_logBBsize;
- threshold_cycle += 1;
- }
- threshold_lsn = xlog_assign_lsn(threshold_cycle,
- threshold_block);
- /*
- * Don't pass in an lsn greater than the lsn of the last
- * log record known to be on disk. Use a snapshot of the last sync lsn
- * so that it doesn't change between the compare and the set.
- */
- last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
- if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
- threshold_lsn = last_sync_lsn;
-
- return threshold_lsn;
-}
-
-/*
- * Push the tail of the log if we need to do so to maintain the free log space
- * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
- * policy which pushes on an lsn which is further along in the log once we
- * reach the high water mark. In this manner, we would be creating a low water
- * mark.
- */
-STATIC void
-xlog_grant_push_ail(
- struct xlog *log,
- int need_bytes)
-{
- xfs_lsn_t threshold_lsn;
-
- threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
- if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log))
- return;
-
- /*
- * Get the transaction layer to kick the dirty buffers out to
- * disk asynchronously. No point in trying to do this if
- * the filesystem is shutting down.
- */
- xfs_ail_push(log->l_ailp, threshold_lsn);
-}
-
-/*
* Stamp cycle number in every block
*/
STATIC void
@@ -2048,8 +1813,8 @@ xlog_sync(
if (ticket) {
ticket->t_curr_res -= roundoff;
} else {
- xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
- xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
+ xlog_grant_add_space(&log->l_reserve_head, roundoff);
+ xlog_grant_add_space(&log->l_write_head, roundoff);
}
/* put cycle number in every block */
@@ -2675,47 +2440,6 @@ xlog_get_lowest_lsn(
}
/*
- * Completion of a iclog IO does not imply that a transaction has completed, as
- * transactions can be large enough to span many iclogs. We cannot change the
- * tail of the log half way through a transaction as this may be the only
- * transaction in the log and moving the tail to point to the middle of it
- * will prevent recovery from finding the start of the transaction. Hence we
- * should only update the last_sync_lsn if this iclog contains transaction
- * completion callbacks on it.
- *
- * We have to do this before we drop the icloglock to ensure we are the only one
- * that can update it.
- *
- * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
- * the reservation grant head pushing. This is due to the fact that the push
- * target is bound by the current last_sync_lsn value. Hence if we have a large
- * amount of log space bound up in this committing transaction then the
- * last_sync_lsn value may be the limiting factor preventing tail pushing from
- * freeing space in the log. Hence once we've updated the last_sync_lsn we
- * should push the AIL to ensure the push target (and hence the grant head) is
- * no longer bound by the old log head location and can move forwards and make
- * progress again.
- */
-static void
-xlog_state_set_callback(
- struct xlog *log,
- struct xlog_in_core *iclog,
- xfs_lsn_t header_lsn)
-{
- trace_xlog_iclog_callback(iclog, _RET_IP_);
- iclog->ic_state = XLOG_STATE_CALLBACK;
-
- ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
- header_lsn) <= 0);
-
- if (list_empty_careful(&iclog->ic_callbacks))
- return;
-
- atomic64_set(&log->l_last_sync_lsn, header_lsn);
- xlog_grant_push_ail(log, 0);
-}
-
-/*
* Return true if we need to stop processing, false to continue to the next
* iclog. The caller will need to run callbacks if the iclog is returned in the
* XLOG_STATE_CALLBACK state.
@@ -2746,7 +2470,17 @@ xlog_state_iodone_process_iclog(
lowest_lsn = xlog_get_lowest_lsn(log);
if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
return false;
- xlog_state_set_callback(log, iclog, header_lsn);
+ /*
+ * If there are no callbacks on this iclog, we can mark it clean
+ * immediately and return. Otherwise we need to run the
+ * callbacks.
+ */
+ if (list_empty(&iclog->ic_callbacks)) {
+ xlog_state_clean_iclog(log, iclog);
+ return false;
+ }
+ trace_xlog_iclog_callback(iclog, _RET_IP_);
+ iclog->ic_state = XLOG_STATE_CALLBACK;
return false;
default:
/*
@@ -3000,18 +2734,15 @@ xfs_log_ticket_regrant(
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- xlog_grant_sub_space(log, &log->l_reserve_head.grant,
- ticket->t_curr_res);
- xlog_grant_sub_space(log, &log->l_write_head.grant,
- ticket->t_curr_res);
+ xlog_grant_sub_space(&log->l_reserve_head, ticket->t_curr_res);
+ xlog_grant_sub_space(&log->l_write_head, ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
if (!ticket->t_cnt) {
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
+ xlog_grant_add_space(&log->l_reserve_head, ticket->t_unit_res);
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
@@ -3058,8 +2789,8 @@ xfs_log_ticket_ungrant(
bytes += ticket->t_unit_res*ticket->t_cnt;
}
- xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
- xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
+ xlog_grant_sub_space(&log->l_reserve_head, bytes);
+ xlog_grant_sub_space(&log->l_write_head, bytes);
trace_xfs_log_ticket_ungrant_exit(log, ticket);
@@ -3532,42 +3263,27 @@ xlog_ticket_alloc(
}
#if defined(DEBUG)
-/*
- * Check to make sure the grant write head didn't just over lap the tail. If
- * the cycles are the same, we can't be overlapping. Otherwise, make sure that
- * the cycles differ by exactly one and check the byte count.
- *
- * This check is run unlocked, so can give false positives. Rather than assert
- * on failures, use a warn-once flag and a panic tag to allow the admin to
- * determine if they want to panic the machine when such an error occurs. For
- * debug kernels this will have the same effect as using an assert but, unlinke
- * an assert, it can be turned off at runtime.
- */
-STATIC void
-xlog_verify_grant_tail(
- struct xlog *log)
+static void
+xlog_verify_dump_tail(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
{
- int tail_cycle, tail_blocks;
- int cycle, space;
-
- xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
- xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
- if (tail_cycle != cycle) {
- if (cycle - 1 != tail_cycle &&
- !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
- xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
- "%s: cycle - 1 != tail_cycle", __func__);
- }
-
- if (space > BBTOB(tail_blocks) &&
- !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
- xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
- "%s: space > BBTOB(tail_blocks)", __func__);
- }
- }
-}
-
-/* check if it will fit */
+ xfs_alert(log->l_mp,
+"ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x",
+ iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1,
+ atomic64_read(&log->l_tail_lsn),
+ log->l_ailp->ail_head_lsn,
+ log->l_curr_cycle, log->l_curr_block,
+ log->l_prev_cycle, log->l_prev_block);
+ xfs_alert(log->l_mp,
+"write grant 0x%llx, reserve grant 0x%llx, tail_space 0x%llx, size 0x%x, iclog flags 0x%x",
+ atomic64_read(&log->l_write_head.grant),
+ atomic64_read(&log->l_reserve_head.grant),
+ log->l_tail_space, log->l_logsize,
+ iclog ? iclog->ic_flags : -1);
+}
+
+/* Check if the new iclog will fit in the log. */
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -3576,21 +3292,34 @@ xlog_verify_tail_lsn(
xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
int blocks;
- if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
- blocks =
- log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
- if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
- xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
- } else {
- ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
+ if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
+ blocks = log->l_logBBsize -
+ (log->l_prev_block - BLOCK_LSN(tail_lsn));
+ if (blocks < BTOBB(iclog->ic_offset) +
+ BTOBB(log->l_iclog_hsize)) {
+ xfs_emerg(log->l_mp,
+ "%s: ran out of log space", __func__);
+ xlog_verify_dump_tail(log, iclog);
+ }
+ return;
+ }
- if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
+ if (CYCLE_LSN(tail_lsn) + 1 != log->l_prev_cycle) {
+ xfs_emerg(log->l_mp, "%s: head has wrapped tail.", __func__);
+ xlog_verify_dump_tail(log, iclog);
+ return;
+ }
+ if (BLOCK_LSN(tail_lsn) == log->l_prev_block) {
xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
+ xlog_verify_dump_tail(log, iclog);
+ return;
+ }
blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
- if (blocks < BTOBB(iclog->ic_offset) + 1)
- xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
- }
+ if (blocks < BTOBB(iclog->ic_offset) + 1) {
+ xfs_emerg(log->l_mp, "%s: ran out of iclog space", __func__);
+ xlog_verify_dump_tail(log, iclog);
+ }
}
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d69acf881153..67c539cc9305 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -156,7 +156,6 @@ int xfs_log_quiesce(struct xfs_mount *mp);
void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
-xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
int xfs_attr_use_log_assist(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f51cbc6405c1..391a938d690c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -694,6 +694,180 @@ xlog_cil_insert_items(
}
}
+static inline void
+xlog_cil_ail_insert_batch(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item **log_items,
+ int nr_items,
+ xfs_lsn_t commit_lsn)
+{
+ int i;
+
+ spin_lock(&ailp->ail_lock);
+ /* xfs_trans_ail_update_bulk drops ailp->ail_lock */
+ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
+
+ for (i = 0; i < nr_items; i++) {
+ struct xfs_log_item *lip = log_items[i];
+
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
+ }
+}
+
+/*
+ * Take the checkpoint's log vector chain of items and insert the attached log
+ * items into the AIL. This uses bulk insertion techniques to minimise AIL lock
+ * traffic.
+ *
+ * The AIL tracks log items via the start record LSN of the checkpoint,
+ * not the commit record LSN. This is because we can pipeline multiple
+ * checkpoints, and so the start record of checkpoint N+1 can be
+ * written before the commit record of checkpoint N. i.e:
+ *
+ * start N commit N
+ * +-------------+------------+----------------+
+ * start N+1 commit N+1
+ *
+ * The tail of the log cannot be moved to the LSN of commit N when all
+ * the items of that checkpoint are written back, because then the
+ * start record for N+1 is no longer in the active portion of the log
+ * and recovery will fail/corrupt the filesystem.
+ *
+ * Hence when all the log items in checkpoint N are written back, the
+ * tail of the log most now only move as far forwards as the start LSN
+ * of checkpoint N+1.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through iop_committed and iop_committing, which
+ * means that checkpoint commit abort handling is treated exactly the same as an
+ * iclog write error even though we haven't started any IO yet. Hence in this
+ * case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk the AIL to
+ * find the insertion point on every xfs_log_item_batch_insert() call. This
+ * saves a lot of needless list walking and is a net win, even though it
+ * slightly increases that amount of AIL lock traffic to set it up and tear it
+ * down.
+ */
+static void
+xlog_cil_ail_insert(
+ struct xfs_cil_ctx *ctx,
+ bool aborted)
+{
+#define LOG_ITEM_BATCH_SIZE 32
+ struct xfs_ail *ailp = ctx->cil->xc_log->l_ailp;
+ struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
+ struct xfs_log_vec *lv;
+ struct xfs_ail_cursor cur;
+ xfs_lsn_t old_head;
+ int i = 0;
+
+ /*
+ * Update the AIL head LSN with the commit record LSN of this
+ * checkpoint. As iclogs are always completed in order, this should
+ * always be the same (as iclogs can contain multiple commit records) or
+ * higher LSN than the current head. We do this before insertion of the
+ * items so that log space checks during insertion will reflect the
+ * space that this checkpoint has already consumed. We call
+ * xfs_ail_update_finish() so that tail space and space-based wakeups
+ * will be recalculated appropriately.
+ */
+ ASSERT(XFS_LSN_CMP(ctx->commit_lsn, ailp->ail_head_lsn) >= 0 ||
+ aborted);
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_cursor_last(ailp, &cur, ctx->start_lsn);
+ old_head = ailp->ail_head_lsn;
+ ailp->ail_head_lsn = ctx->commit_lsn;
+ /* xfs_ail_update_finish() drops the ail_lock */
+ xfs_ail_update_finish(ailp, NULLCOMMITLSN);
+
+ /*
+ * We move the AIL head forwards to account for the space used in the
+ * log before we remove that space from the grant heads. This prevents a
+ * transient condition where reservation space appears to become
+ * available on return, only for it to disappear again immediately as
+ * the AIL head update accounts in the log tail space.
+ */
+ smp_wmb(); /* paired with smp_rmb in xlog_grant_space_left */
+ xlog_grant_return_space(ailp->ail_log, old_head, ailp->ail_head_lsn);
+
+ /* unpin all the log items */
+ list_for_each_entry(lv, &ctx->lv_chain, lv_list) {
+ struct xfs_log_item *lip = lv->lv_item;
+ xfs_lsn_t item_lsn;
+
+ if (aborted)
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
+
+ if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
+ lip->li_ops->iop_release(lip);
+ continue;
+ }
+
+ if (lip->li_ops->iop_committed)
+ item_lsn = lip->li_ops->iop_committed(lip,
+ ctx->start_lsn);
+ else
+ item_lsn = ctx->start_lsn;
+
+ /* item_lsn of -1 means the item needs no further processing */
+ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+ continue;
+
+ /*
+ * if we are aborting the operation, no point in inserting the
+ * object into the AIL as we are in a shutdown situation.
+ */
+ if (aborted) {
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 1);
+ continue;
+ }
+
+ if (item_lsn != ctx->start_lsn) {
+
+ /*
+ * Not a bulk update option due to unusual item_lsn.
+ * Push into AIL immediately, rechecking the lsn once
+ * we have the ail lock. Then unpin the item. This does
+ * not affect the AIL cursor the bulk insert path is
+ * using.
+ */
+ spin_lock(&ailp->ail_lock);
+ if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+ xfs_trans_ail_update(ailp, lip, item_lsn);
+ else
+ spin_unlock(&ailp->ail_lock);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
+ continue;
+ }
+
+ /* Item is a candidate for bulk AIL insert. */
+ log_items[i++] = lv->lv_item;
+ if (i >= LOG_ITEM_BATCH_SIZE) {
+ xlog_cil_ail_insert_batch(ailp, &cur, log_items,
+ LOG_ITEM_BATCH_SIZE, ctx->start_lsn);
+ i = 0;
+ }
+ }
+
+ /* make sure we insert the remainder! */
+ if (i)
+ xlog_cil_ail_insert_batch(ailp, &cur, log_items, i,
+ ctx->start_lsn);
+
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
static void
xlog_cil_free_logvec(
struct list_head *lv_chain)
@@ -733,8 +907,7 @@ xlog_cil_committed(
spin_unlock(&ctx->cil->xc_push_lock);
}
- xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
- ctx->start_lsn, abort);
+ xlog_cil_ail_insert(ctx, abort);
xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 40e22ec0fbe6..b8778a4fd6b6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -431,18 +431,16 @@ struct xlog {
int l_prev_block; /* previous logical log block */
/*
- * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
- * read without needing to hold specific locks. To avoid operations
- * contending with other hot objects, place each of them on a separate
- * cacheline.
+ * l_tail_lsn is atomic so it can be set and read without needing to
+ * hold specific locks. To avoid operations contending with other hot
+ * objects, it on a separate cacheline.
*/
- /* lsn of last LR on disk */
- atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
/* lsn of 1st LR with unflushed * buffers */
atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
struct xlog_grant_head l_reserve_head;
struct xlog_grant_head l_write_head;
+ uint64_t l_tail_space;
struct xfs_kobj l_kobj;
@@ -546,36 +544,6 @@ xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
}
/*
- * When we crack the grant head, we sample it first so that the value will not
- * change while we are cracking it into the component values. This means we
- * will always get consistent component values to work from.
- */
-static inline void
-xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
-{
- *cycle = val >> 32;
- *space = val & 0xffffffff;
-}
-
-static inline void
-xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
-{
- xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
-}
-
-static inline int64_t
-xlog_assign_grant_head_val(int cycle, int space)
-{
- return ((int64_t)cycle << 32) | space;
-}
-
-static inline void
-xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
-{
- atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
-}
-
-/*
* Committed Item List interfaces
*/
int xlog_cil_init(struct xlog *log);
@@ -623,6 +591,27 @@ xlog_wait(
int xlog_wait_on_iclog(struct xlog_in_core *iclog)
__releases(iclog->ic_log->l_icloglock);
+/* Calculate the distance between two LSNs in bytes */
+static inline uint64_t
+xlog_lsn_sub(
+ struct xlog *log,
+ xfs_lsn_t high,
+ xfs_lsn_t low)
+{
+ uint32_t hi_cycle = CYCLE_LSN(high);
+ uint32_t hi_block = BLOCK_LSN(high);
+ uint32_t lo_cycle = CYCLE_LSN(low);
+ uint32_t lo_block = BLOCK_LSN(low);
+
+ if (hi_cycle == lo_cycle)
+ return BBTOB(hi_block - lo_block);
+ ASSERT((hi_cycle == lo_cycle + 1) || xlog_is_shutdown(log));
+ return (uint64_t)log->l_logsize - BBTOB(lo_block - hi_block);
+}
+
+void xlog_grant_return_space(struct xlog *log, xfs_lsn_t old_head,
+ xfs_lsn_t new_head);
+
/*
* The LSN is valid so long as it is behind the current LSN. If it isn't, this
* means that the next log record that includes this metadata could have a
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4fe627991e86..4423dd344239 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1177,8 +1177,8 @@ xlog_check_unmount_rec(
*/
xlog_assign_atomic_lsn(&log->l_tail_lsn,
log->l_curr_cycle, after_umount_blk);
- xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
- log->l_curr_cycle, after_umount_blk);
+ log->l_ailp->ail_head_lsn =
+ atomic64_read(&log->l_tail_lsn);
*tail_blk = after_umount_blk;
*clean = true;
@@ -1212,11 +1212,7 @@ xlog_set_state(
if (bump_cycle)
log->l_curr_cycle++;
atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
- atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
+ log->l_ailp->ail_head_lsn = be64_to_cpu(rhead->h_lsn);
}
/*
@@ -2489,7 +2485,10 @@ xlog_recover_process_data(
ohead = (struct xlog_op_header *)dp;
dp += sizeof(*ohead);
- ASSERT(dp <= end);
+ if (dp > end) {
+ xfs_warn(log->l_mp, "%s: op header overrun", __func__);
+ return -EFSCORRUPTED;
+ }
/* errors will abort recovery */
error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
@@ -3363,14 +3362,13 @@ xlog_do_recover(
/*
* We now update the tail_lsn since much of the recovery has completed
- * and there may be space available to use. If there were no extent
- * or iunlinks, we can free up the entire log and set the tail_lsn to
- * be the last_sync_lsn. This was set in xlog_find_tail to be the
- * lsn of the last known good LR on disk. If there are extent frees
- * or iunlinks they will have some entries in the AIL; so we look at
- * the AIL to determine how to set the tail_lsn.
+ * and there may be space available to use. If there were no extent or
+ * iunlinks, we can free up the entire log. This was set in
+ * xlog_find_tail to be the lsn of the last known good LR on disk. If
+ * there are extent frees or iunlinks they will have some entries in the
+ * AIL; so we look at the AIL to determine how to set the tail_lsn.
*/
- xlog_assign_tail_lsn(mp);
+ xfs_ail_assign_tail_lsn(log->l_ailp);
/*
* Now that we've finished replaying all buffer and inode updates,
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 47120b745c47..9490b913a4ab 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -793,12 +793,15 @@ xfs_qm_qino_alloc(
return error;
if (need_alloc) {
+ struct xfs_icreate_args args = {
+ .mode = S_IFREG,
+ .flags = XFS_ICREATE_UNLINKABLE,
+ };
xfs_ino_t ino;
error = xfs_dialloc(&tp, 0, S_IFREG, &ino);
if (!error)
- error = xfs_init_new_inode(&nop_mnt_idmap, tp, NULL, ino,
- S_IFREG, 1, 0, 0, false, ipp);
+ error = xfs_icreate(tp, ino, &args, ipp);
if (error) {
xfs_trans_cancel(tp);
return error;
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 271c1021c733..a11436579877 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -11,7 +11,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
-#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_qm.h"
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 14919b33e4fe..27398512b179 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -21,6 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
struct kmem_cache *xfs_cui_cache;
struct kmem_cache *xfs_cud_cache;
@@ -227,6 +229,11 @@ static const struct xfs_item_ops xfs_cud_item_ops = {
.iop_intent = xfs_cud_item_intent,
};
+static inline struct xfs_refcount_intent *ci_entry(const struct list_head *e)
+{
+ return list_entry(e, struct xfs_refcount_intent, ri_list);
+}
+
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -234,34 +241,12 @@ xfs_refcount_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_refcount_intent *ra;
- struct xfs_refcount_intent *rb;
-
- ra = container_of(a, struct xfs_refcount_intent, ri_list);
- rb = container_of(b, struct xfs_refcount_intent, ri_list);
+ struct xfs_refcount_intent *ra = ci_entry(a);
+ struct xfs_refcount_intent *rb = ci_entry(b);
return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
}
-/* Set the phys extent flags for this reverse mapping. */
-static void
-xfs_trans_set_refcount_flags(
- struct xfs_phys_extent *pmap,
- enum xfs_refcount_intent_type type)
-{
- pmap->pe_flags = 0;
- switch (type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- pmap->pe_flags |= type;
- break;
- default:
- ASSERT(0);
- }
-}
-
/* Log refcount updates in the intent item. */
STATIC void
xfs_refcount_update_log_item(
@@ -282,7 +267,18 @@ xfs_refcount_update_log_item(
pmap = &cuip->cui_format.cui_extents[next_extent];
pmap->pe_startblock = ri->ri_startblock;
pmap->pe_len = ri->ri_blockcount;
- xfs_trans_set_refcount_flags(pmap, ri->ri_type);
+
+ pmap->pe_flags = 0;
+ switch (ri->ri_type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ pmap->pe_flags |= ri->ri_type;
+ break;
+ default:
+ ASSERT(0);
+ }
}
static struct xfs_log_item *
@@ -324,24 +320,29 @@ xfs_refcount_update_create_done(
return &cudp->cud_item;
}
-/* Take a passive ref to the AG containing the space we're refcounting. */
+/* Add this deferred CUI to the transaction. */
void
-xfs_refcount_update_get_group(
- struct xfs_mount *mp,
+xfs_refcount_defer_add(
+ struct xfs_trans *tp,
struct xfs_refcount_intent *ri)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = tp->t_mountp;
+
+ trace_xfs_refcount_defer(mp, ri);
- agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock);
- ri->ri_pag = xfs_perag_intent_get(mp, agno);
+ ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
}
-/* Release a passive AG ref after finishing refcounting work. */
-static inline void
-xfs_refcount_update_put_group(
- struct xfs_refcount_intent *ri)
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+ struct list_head *item)
{
+ struct xfs_refcount_intent *ri = ci_entry(item);
+
xfs_perag_intent_put(ri->ri_pag);
+ kmem_cache_free(xfs_refcount_intent_cache, ri);
}
/* Process a deferred refcount update. */
@@ -352,11 +353,9 @@ xfs_refcount_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_refcount_intent *ri;
+ struct xfs_refcount_intent *ri = ci_entry(item);
int error;
- ri = container_of(item, struct xfs_refcount_intent, ri_list);
-
/* Did we run out of reservation? Requeue what we didn't finish. */
error = xfs_refcount_finish_one(tp, ri, state);
if (!error && ri->ri_blockcount > 0) {
@@ -365,30 +364,33 @@ xfs_refcount_update_finish_item(
return -EAGAIN;
}
- xfs_refcount_update_put_group(ri);
- kmem_cache_free(xfs_refcount_intent_cache, ri);
+ xfs_refcount_update_cancel_item(item);
return error;
}
-/* Abort all pending CUIs. */
+/* Clean up after calling xfs_refcount_finish_one. */
STATIC void
-xfs_refcount_update_abort_intent(
- struct xfs_log_item *intent)
+xfs_refcount_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
{
- xfs_cui_release(CUI_ITEM(intent));
+ struct xfs_buf *agbp;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error)
+ xfs_trans_brelse(tp, agbp);
}
-/* Cancel a deferred refcount update. */
+/* Abort all pending CUIs. */
STATIC void
-xfs_refcount_update_cancel_item(
- struct list_head *item)
+xfs_refcount_update_abort_intent(
+ struct xfs_log_item *intent)
{
- struct xfs_refcount_intent *ri;
-
- ri = container_of(item, struct xfs_refcount_intent, ri_list);
-
- xfs_refcount_update_put_group(ri);
- kmem_cache_free(xfs_refcount_intent_cache, ri);
+ xfs_cui_release(CUI_ITEM(intent));
}
/* Is this recovered CUI ok? */
@@ -429,7 +431,7 @@ xfs_cui_recover_work(
ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
- xfs_refcount_update_get_group(mp, ri);
+ ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock);
xfs_defer_add_item(dfp, &ri->ri_list);
}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index eb0ab13682d0..bfee8f30c63c 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -71,4 +71,9 @@ struct xfs_cud_log_item {
extern struct kmem_cache *xfs_cui_cache;
extern struct kmem_cache *xfs_cud_cache;
+struct xfs_refcount_intent;
+
+void xfs_refcount_defer_add(struct xfs_trans *tp,
+ struct xfs_refcount_intent *ri);
+
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 265a2a418bc7..6fde6ec8092f 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -603,7 +603,7 @@ xfs_reflink_cancel_cow_blocks(
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE, false);
+ XFS_AG_RESV_NONE, 0);
if (error)
break;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 65c5dfe17ecf..fb55e4ce49fa 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,6 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
-static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
-{
- return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
-}
-
-static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
-{
- return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
-}
-
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index e473124e29cc..88b5580e1e19 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -21,6 +21,8 @@
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
struct kmem_cache *xfs_rui_cache;
struct kmem_cache *xfs_rud_cache;
@@ -226,47 +228,9 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
.iop_intent = xfs_rud_item_intent,
};
-/* Set the map extent flags for this reverse mapping. */
-static void
-xfs_trans_set_rmap_flags(
- struct xfs_map_extent *map,
- enum xfs_rmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
+static inline struct xfs_rmap_intent *ri_entry(const struct list_head *e)
{
- map->me_flags = 0;
- if (state == XFS_EXT_UNWRITTEN)
- map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
- switch (type) {
- case XFS_RMAP_MAP:
- map->me_flags |= XFS_RMAP_EXTENT_MAP;
- break;
- case XFS_RMAP_MAP_SHARED:
- map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
- break;
- case XFS_RMAP_UNMAP:
- map->me_flags |= XFS_RMAP_EXTENT_UNMAP;
- break;
- case XFS_RMAP_UNMAP_SHARED:
- map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
- break;
- case XFS_RMAP_CONVERT:
- map->me_flags |= XFS_RMAP_EXTENT_CONVERT;
- break;
- case XFS_RMAP_CONVERT_SHARED:
- map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
- break;
- case XFS_RMAP_ALLOC:
- map->me_flags |= XFS_RMAP_EXTENT_ALLOC;
- break;
- case XFS_RMAP_FREE:
- map->me_flags |= XFS_RMAP_EXTENT_FREE;
- break;
- default:
- ASSERT(0);
- }
+ return list_entry(e, struct xfs_rmap_intent, ri_list);
}
/* Sort rmap intents by AG. */
@@ -276,11 +240,8 @@ xfs_rmap_update_diff_items(
const struct list_head *a,
const struct list_head *b)
{
- struct xfs_rmap_intent *ra;
- struct xfs_rmap_intent *rb;
-
- ra = container_of(a, struct xfs_rmap_intent, ri_list);
- rb = container_of(b, struct xfs_rmap_intent, ri_list);
+ struct xfs_rmap_intent *ra = ri_entry(a);
+ struct xfs_rmap_intent *rb = ri_entry(b);
return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
}
@@ -307,8 +268,40 @@ xfs_rmap_update_log_item(
map->me_startblock = ri->ri_bmap.br_startblock;
map->me_startoff = ri->ri_bmap.br_startoff;
map->me_len = ri->ri_bmap.br_blockcount;
- xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork,
- ri->ri_bmap.br_state);
+
+ map->me_flags = 0;
+ if (ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN)
+ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ if (ri->ri_whichfork == XFS_ATTR_FORK)
+ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ switch (ri->ri_type) {
+ case XFS_RMAP_MAP:
+ map->me_flags |= XFS_RMAP_EXTENT_MAP;
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ break;
+ case XFS_RMAP_UNMAP:
+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_CONVERT:
+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_ALLOC:
+ map->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ break;
+ case XFS_RMAP_FREE:
+ map->me_flags |= XFS_RMAP_EXTENT_FREE;
+ break;
+ default:
+ ASSERT(0);
+ }
}
static struct xfs_log_item *
@@ -350,24 +343,29 @@ xfs_rmap_update_create_done(
return &rudp->rud_item;
}
-/* Take a passive ref to the AG containing the space we're rmapping. */
+/* Add this deferred RUI to the transaction. */
void
-xfs_rmap_update_get_group(
- struct xfs_mount *mp,
+xfs_rmap_defer_add(
+ struct xfs_trans *tp,
struct xfs_rmap_intent *ri)
{
- xfs_agnumber_t agno;
+ struct xfs_mount *mp = tp->t_mountp;
+
+ trace_xfs_rmap_defer(mp, ri);
- agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
- ri->ri_pag = xfs_perag_intent_get(mp, agno);
+ ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
-/* Release a passive AG ref after finishing rmapping work. */
-static inline void
-xfs_rmap_update_put_group(
- struct xfs_rmap_intent *ri)
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+ struct list_head *item)
{
+ struct xfs_rmap_intent *ri = ri_entry(item);
+
xfs_perag_intent_put(ri->ri_pag);
+ kmem_cache_free(xfs_rmap_intent_cache, ri);
}
/* Process a deferred rmap update. */
@@ -378,37 +376,38 @@ xfs_rmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_rmap_intent *ri;
+ struct xfs_rmap_intent *ri = ri_entry(item);
int error;
- ri = container_of(item, struct xfs_rmap_intent, ri_list);
-
error = xfs_rmap_finish_one(tp, ri, state);
- xfs_rmap_update_put_group(ri);
- kmem_cache_free(xfs_rmap_intent_cache, ri);
+ xfs_rmap_update_cancel_item(item);
return error;
}
-/* Abort all pending RUIs. */
+/* Clean up after calling xfs_rmap_finish_one. */
STATIC void
-xfs_rmap_update_abort_intent(
- struct xfs_log_item *intent)
+xfs_rmap_finish_one_cleanup(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur *rcur,
+ int error)
{
- xfs_rui_release(RUI_ITEM(intent));
+ struct xfs_buf *agbp = NULL;
+
+ if (rcur == NULL)
+ return;
+ agbp = rcur->bc_ag.agbp;
+ xfs_btree_del_cursor(rcur, error);
+ if (error && agbp)
+ xfs_trans_brelse(tp, agbp);
}
-/* Cancel a deferred rmap update. */
+/* Abort all pending RUIs. */
STATIC void
-xfs_rmap_update_cancel_item(
- struct list_head *item)
+xfs_rmap_update_abort_intent(
+ struct xfs_log_item *intent)
{
- struct xfs_rmap_intent *ri;
-
- ri = container_of(item, struct xfs_rmap_intent, ri_list);
-
- xfs_rmap_update_put_group(ri);
- kmem_cache_free(xfs_rmap_intent_cache, ri);
+ xfs_rui_release(RUI_ITEM(intent));
}
/* Is this recovered RUI ok? */
@@ -495,7 +494,7 @@ xfs_rui_recover_work(
ri->ri_bmap.br_blockcount = map->me_len;
ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- xfs_rmap_update_get_group(mp, ri);
+ ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
xfs_defer_add_item(dfp, &ri->ri_list);
}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 802e5119eaca..40d331555675 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -71,4 +71,8 @@ struct xfs_rud_log_item {
extern struct kmem_cache *xfs_rui_cache;
extern struct kmem_cache *xfs_rud_cache;
+struct xfs_rmap_intent;
+
+void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5a7ddfed1bb8..0c3e96c621a6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -12,6 +12,7 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap_util.h"
@@ -1382,7 +1383,7 @@ retry:
start = 0;
} else if (xfs_bmap_adjacent(ap)) {
start = xfs_rtb_to_rtx(mp, ap->blkno);
- } else if (ap->eof && ap->offset == 0) {
+ } else if (ap->datatype & XFS_ALLOC_INITIAL_USER_DATA) {
/*
* If it's an allocation to an empty file at offset 0, pick an
* extent that will space things out in the rt area.
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 17aee806ec2e..77f19e2f66e0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -90,19 +90,25 @@ xfs_symlink(
struct xfs_inode **ipp)
{
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_icreate_args args = {
+ .idmap = idmap,
+ .pip = dp,
+ .mode = S_IFLNK | (mode & ~S_IFMT),
+ };
+ struct xfs_dir_update du = {
+ .dp = dp,
+ .name = link_name,
+ };
struct xfs_trans *tp = NULL;
- struct xfs_inode *ip = NULL;
int error = 0;
int pathlen;
bool unlock_dp_on_error = false;
xfs_filblks_t fs_blocks;
- prid_t prid;
- struct xfs_dquot *udqp = NULL;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
uint resblks;
xfs_ino_t ino;
- struct xfs_parent_args *ppargs;
*ipp = NULL;
@@ -119,15 +125,8 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- prid = xfs_get_initial_prid(dp);
-
- /*
- * Make sure that we have allocated dquot(s) on disk.
- */
- error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
- mapped_fsgid(idmap, &init_user_ns), prid,
- XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
- &udqp, &gdqp, &pdqp);
+ /* Make sure that we have allocated dquot(s) on disk. */
+ error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp);
if (error)
return error;
@@ -143,7 +142,7 @@ xfs_symlink(
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = xfs_symlink_space_res(mp, link_name->len, fs_blocks);
- error = xfs_parent_start(mp, &ppargs);
+ error = xfs_parent_start(mp, &du.ppargs);
if (error)
goto out_release_dquots;
@@ -168,9 +167,7 @@ xfs_symlink(
*/
error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino);
if (!error)
- error = xfs_init_new_inode(idmap, tp, dp, ino,
- S_IFLNK | (mode & ~S_IFMT), 1, 0, prid,
- xfs_has_parent(mp), &ip);
+ error = xfs_icreate(tp, ino, &args, &du.ip);
if (error)
goto out_trans_cancel;
@@ -186,33 +183,22 @@ xfs_symlink(
/*
* Also attach the dquot(s) to it, if applicable.
*/
- xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+ xfs_qm_vop_create_dqattach(tp, du.ip, udqp, gdqp, pdqp);
resblks -= XFS_IALLOC_SPACE_RES(mp);
- error = xfs_symlink_write_target(tp, ip, ip->i_ino, target_path,
+ error = xfs_symlink_write_target(tp, du.ip, du.ip->i_ino, target_path,
pathlen, fs_blocks, resblks);
if (error)
goto out_trans_cancel;
resblks -= fs_blocks;
- i_size_write(VFS_I(ip), ip->i_disk_size);
+ i_size_write(VFS_I(du.ip), du.ip->i_disk_size);
/*
* Create the directory entry for the symlink.
*/
- error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, resblks);
+ error = xfs_dir_create_child(tp, resblks, &du);
if (error)
goto out_trans_cancel;
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
- /* Add parent pointer for the new symlink. */
- if (ppargs) {
- error = xfs_parent_addname(tp, ppargs, dp, link_name, ip);
- if (error)
- goto out_trans_cancel;
- }
-
- xfs_dir_update_hook(dp, ip, 1, link_name);
/*
* If this is a synchronous mount, make sure that the
@@ -230,10 +216,10 @@ xfs_symlink(
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
- *ipp = ip;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ *ipp = du.ip;
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
return 0;
out_trans_cancel:
@@ -244,13 +230,13 @@ out_release_inode:
* setup of the inode and release the inode. This prevents recursive
* transactions and deadlocks from xfs_inactive.
*/
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_finish_inode_setup(ip);
- xfs_irele(ip);
+ if (du.ip) {
+ xfs_iunlock(du.ip, XFS_ILOCK_EXCL);
+ xfs_finish_inode_setup(du.ip);
+ xfs_irele(du.ip);
}
out_parent:
- xfs_parent_finish(mp, ppargs);
+ xfs_parent_finish(mp, du.ppargs);
out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index d2391eec37fe..60cb5318fdae 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -432,39 +432,30 @@ log_tail_lsn_show(
XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
-reserve_grant_head_show(
+reserve_grant_head_bytes_show(
struct kobject *kobject,
char *buf)
-
{
- int cycle;
- int bytes;
- struct xlog *log = to_xlog(kobject);
-
- xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
- return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%lld\n",
+ atomic64_read(&to_xlog(kobject)->l_reserve_head.grant));
}
-XFS_SYSFS_ATTR_RO(reserve_grant_head);
+XFS_SYSFS_ATTR_RO(reserve_grant_head_bytes);
STATIC ssize_t
-write_grant_head_show(
+write_grant_head_bytes_show(
struct kobject *kobject,
char *buf)
{
- int cycle;
- int bytes;
- struct xlog *log = to_xlog(kobject);
-
- xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
- return sysfs_emit(buf, "%d:%d\n", cycle, bytes);
+ return sysfs_emit(buf, "%lld\n",
+ atomic64_read(&to_xlog(kobject)->l_write_head.grant));
}
-XFS_SYSFS_ATTR_RO(write_grant_head);
+XFS_SYSFS_ATTR_RO(write_grant_head_bytes);
static struct attribute *xfs_log_attrs[] = {
ATTR_LIST(log_head_lsn),
ATTR_LIST(log_tail_lsn),
- ATTR_LIST(reserve_grant_head),
- ATTR_LIST(write_grant_head),
+ ATTR_LIST(reserve_grant_head_bytes),
+ ATTR_LIST(write_grant_head_bytes),
NULL,
};
ATTRIBUTE_GROUPS(xfs_log);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 9c7fbaae2717..2af9f274e872 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -22,6 +22,7 @@
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
@@ -38,10 +39,11 @@
#include "xfs_iomap.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
-#include "xfs_bmap.h"
#include "xfs_exchmaps.h"
#include "xfs_exchrange.h"
#include "xfs_parent.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 25ff6fe1eb6c..5646d300b286 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -90,6 +90,9 @@ struct xfs_exchrange;
struct xfs_getparents;
struct xfs_parent_irec;
struct xfs_attrlist_cursor_kern;
+struct xfs_extent_free_item;
+struct xfs_rmap_intent;
+struct xfs_refcount_intent;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -1227,6 +1230,7 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_ARGS(log, tic),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(unsigned long, tic)
__field(char, ocnt)
__field(char, cnt)
__field(int, curr_res)
@@ -1234,16 +1238,16 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__field(unsigned int, flags)
__field(int, reserveq)
__field(int, writeq)
- __field(int, grant_reserve_cycle)
- __field(int, grant_reserve_bytes)
- __field(int, grant_write_cycle)
- __field(int, grant_write_bytes)
+ __field(uint64_t, grant_reserve_bytes)
+ __field(uint64_t, grant_write_bytes)
+ __field(uint64_t, tail_space)
__field(int, curr_cycle)
__field(int, curr_block)
__field(xfs_lsn_t, tail_lsn)
),
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
+ __entry->tic = (unsigned long)tic;
__entry->ocnt = tic->t_ocnt;
__entry->cnt = tic->t_cnt;
__entry->curr_res = tic->t_curr_res;
@@ -1251,23 +1255,22 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->flags = tic->t_flags;
__entry->reserveq = list_empty(&log->l_reserve_head.waiters);
__entry->writeq = list_empty(&log->l_write_head.waiters);
- xlog_crack_grant_head(&log->l_reserve_head.grant,
- &__entry->grant_reserve_cycle,
- &__entry->grant_reserve_bytes);
- xlog_crack_grant_head(&log->l_write_head.grant,
- &__entry->grant_write_cycle,
- &__entry->grant_write_bytes);
+ __entry->tail_space = READ_ONCE(log->l_tail_space);
+ __entry->grant_reserve_bytes = __entry->tail_space +
+ atomic64_read(&log->l_reserve_head.grant);
+ __entry->grant_write_bytes = __entry->tail_space +
+ atomic64_read(&log->l_write_head.grant);
__entry->curr_cycle = log->l_curr_cycle;
__entry->curr_block = log->l_curr_block;
__entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
),
- TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u "
- "t_unit_res %u t_flags %s reserveq %s "
- "writeq %s grant_reserve_cycle %d "
- "grant_reserve_bytes %d grant_write_cycle %d "
- "grant_write_bytes %d curr_cycle %d curr_block %d "
+ TP_printk("dev %d:%d tic 0x%lx t_ocnt %u t_cnt %u t_curr_res %u "
+ "t_unit_res %u t_flags %s reserveq %s writeq %s "
+ "tail space %llu grant_reserve_bytes %llu "
+ "grant_write_bytes %llu curr_cycle %d curr_block %d "
"tail_cycle %d tail_block %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tic,
__entry->ocnt,
__entry->cnt,
__entry->curr_res,
@@ -1275,9 +1278,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
__entry->reserveq ? "empty" : "active",
__entry->writeq ? "empty" : "active",
- __entry->grant_reserve_cycle,
+ __entry->tail_space,
__entry->grant_reserve_bytes,
- __entry->grant_write_cycle,
__entry->grant_write_bytes,
__entry->curr_cycle,
__entry->curr_block,
@@ -1305,6 +1307,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_return);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1404,19 +1407,19 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
__field(dev_t, dev)
__field(xfs_lsn_t, new_lsn)
__field(xfs_lsn_t, old_lsn)
- __field(xfs_lsn_t, last_sync_lsn)
+ __field(xfs_lsn_t, head_lsn)
),
TP_fast_assign(
__entry->dev = log->l_mp->m_super->s_dev;
__entry->new_lsn = new_lsn;
__entry->old_lsn = atomic64_read(&log->l_tail_lsn);
- __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ __entry->head_lsn = log->l_ailp->ail_head_lsn;
),
- TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+ TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, head lsn %d/%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
- CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+ CYCLE_LSN(__entry->head_lsn), BLOCK_LSN(__entry->head_lsn))
)
DECLARE_EVENT_CLASS(xfs_file_class,
@@ -2460,6 +2463,35 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
+DECLARE_EVENT_CLASS(xfs_rtdiscard_class,
+ TP_PROTO(struct xfs_mount *mp,
+ xfs_rtblock_t rtbno, xfs_rtblock_t len),
+ TP_ARGS(mp, rtbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rtblock_t, rtbno)
+ __field(xfs_rtblock_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_rtdev_targp->bt_dev;
+ __entry->rtbno = rtbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rtbno 0x%llx rtbcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rtbno,
+ __entry->len)
+)
+
+#define DEFINE_RTDISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_rtdiscard_class, name, \
+ TP_PROTO(struct xfs_mount *mp, \
+ xfs_rtblock_t rtbno, xfs_rtblock_t len), \
+ TP_ARGS(mp, rtbno, len))
+DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent);
+DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall);
+DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax);
+
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
@@ -2681,41 +2713,37 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int type, xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, type, agbno, len),
+ TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free),
+ TP_ARGS(mp, free),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(int, type)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
+ __field(unsigned int, flags)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->type = type;
- __entry->agbno = agbno;
- __entry->len = len;
+ __entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ __entry->len = free->xefi_blockcount;
+ __entry->flags = free->xefi_flags;
),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
__entry->agno,
__entry->agbno,
- __entry->len)
+ __entry->len,
+ __entry->flags)
);
#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int type, \
- xfs_agblock_t bno, \
- xfs_extlen_t len), \
- TP_ARGS(mp, agno, type, bno, len))
-DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+ TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \
+ TP_ARGS(mp, free))
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred);
DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
@@ -2760,10 +2788,10 @@ DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item);
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
const struct xfs_owner_info *oinfo),
- TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
+ TP_ARGS(cur, agbno, len, unwritten, oinfo),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2774,8 +2802,8 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
__field(unsigned long, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = oinfo->oi_owner;
@@ -2795,57 +2823,109 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
);
#define DEFINE_RMAP_EVENT(name) \
DEFINE_EVENT(xfs_rmap_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ TP_PROTO(struct xfs_btree_cur *cur, \
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
const struct xfs_owner_info *oinfo), \
- TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
+ TP_ARGS(cur, agbno, len, unwritten, oinfo))
-/* simple AG-based error/%ip tracepoint class */
-DECLARE_EVENT_CLASS(xfs_ag_error_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+/* btree cursor error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_btree_error_class,
+ TP_PROTO(struct xfs_btree_cur *cur, int error,
unsigned long caller_ip),
- TP_ARGS(mp, agno, error, caller_ip),
+ TP_ARGS(cur, error, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
__field(int, error)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_INODE:
+ __entry->agno = 0;
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->ino = 0;
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ __entry->agno = 0;
+ __entry->ino = 0;
+ break;
+ }
__entry->error = error;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
+ TP_printk("dev %d:%d agno 0x%x ino 0x%llx error %d caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __entry->ino,
__entry->error,
(char *)__entry->caller_ip)
);
-#define DEFINE_AG_ERROR_EVENT(name) \
-DEFINE_EVENT(xfs_ag_error_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+#define DEFINE_BTREE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_btree_error_class, name, \
+ TP_PROTO(struct xfs_btree_cur *cur, int error, \
unsigned long caller_ip), \
- TP_ARGS(mp, agno, error, caller_ip))
+ TP_ARGS(cur, error, caller_ip))
DEFINE_RMAP_EVENT(xfs_rmap_unmap);
DEFINE_RMAP_EVENT(xfs_rmap_unmap_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_unmap_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_unmap_error);
DEFINE_RMAP_EVENT(xfs_rmap_map);
DEFINE_RMAP_EVENT(xfs_rmap_map_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_map_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_map_error);
DEFINE_RMAP_EVENT(xfs_rmap_convert);
DEFINE_RMAP_EVENT(xfs_rmap_convert_done);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_convert_state);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_convert_error);
+
+TRACE_EVENT(xfs_rmap_convert_state,
+ TP_PROTO(struct xfs_btree_cur *cur, int state,
+ unsigned long caller_ip),
+ TP_ARGS(cur, state, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_ino_t, ino)
+ __field(int, state)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ switch (cur->bc_ops->type) {
+ case XFS_BTREE_TYPE_INODE:
+ __entry->agno = 0;
+ __entry->ino = cur->bc_ino.ip->i_ino;
+ break;
+ case XFS_BTREE_TYPE_AG:
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->ino = 0;
+ break;
+ case XFS_BTREE_TYPE_MEM:
+ __entry->agno = 0;
+ __entry->ino = 0;
+ break;
+ }
+ __entry->state = state;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x ino 0x%llx state %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->ino,
+ __entry->state,
+ (char *)__entry->caller_ip)
+);
DECLARE_EVENT_CLASS(xfs_rmapbt_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t len,
uint64_t owner, uint64_t offset, unsigned int flags),
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_ARGS(cur, agbno, len, owner, offset, flags),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2856,8 +2936,8 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = owner;
@@ -2875,25 +2955,27 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
);
#define DEFINE_RMAPBT_EVENT(name) \
DEFINE_EVENT(xfs_rmapbt_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ TP_PROTO(struct xfs_btree_cur *cur, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+ TP_ARGS(cur, agbno, len, owner, offset, flags))
+
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int op,
- xfs_agblock_t agbno,
- xfs_ino_t ino,
- int whichfork,
- xfs_fileoff_t offset,
- xfs_filblks_t len,
- xfs_exntst_t state),
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+ TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri),
+ TP_ARGS(mp, ri),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(unsigned long long, owner)
__field(xfs_agnumber_t, agno)
- __field(xfs_ino_t, ino)
__field(xfs_agblock_t, agbno)
__field(int, whichfork)
__field(xfs_fileoff_t, l_loff)
@@ -2903,21 +2985,22 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->ino = ino;
- __entry->agbno = agbno;
- __entry->whichfork = whichfork;
- __entry->l_loff = offset;
- __entry->l_len = len;
- __entry->l_state = state;
- __entry->op = op;
- ),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ __entry->agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock);
+ __entry->agbno = XFS_FSB_TO_AGBNO(mp,
+ ri->ri_bmap.br_startblock);
+ __entry->owner = ri->ri_owner;
+ __entry->whichfork = ri->ri_whichfork;
+ __entry->l_loff = ri->ri_bmap.br_startoff;
+ __entry->l_len = ri->ri_bmap.br_blockcount;
+ __entry->l_state = ri->ri_bmap.br_state;
+ __entry->op = ri->ri_type;
+ ),
+ TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->op,
+ __print_symbolic(__entry->op, XFS_RMAP_INTENT_STRINGS),
__entry->agno,
__entry->agbno,
- __entry->ino,
+ __entry->owner,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->l_loff,
__entry->l_len,
@@ -2925,24 +3008,17 @@ DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
);
#define DEFINE_RMAP_DEFERRED_EVENT(name) \
DEFINE_EVENT(xfs_rmap_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int op, \
- xfs_agblock_t agbno, \
- xfs_ino_t ino, \
- int whichfork, \
- xfs_fileoff_t offset, \
- xfs_filblks_t len, \
- xfs_exntst_t state), \
- TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
+ TP_PROTO(struct xfs_mount *mp, struct xfs_rmap_intent *ri), \
+ TP_ARGS(mp, ri))
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
DEFINE_RMAPBT_EVENT(xfs_rmap_update);
DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
-DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_insert_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_delete_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_rmap_update_error);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
@@ -3068,21 +3144,74 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
+/* simple AG-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_ag_error_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+ unsigned long caller_ip),
+ TP_ARGS(mp, agno, error, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(int, error)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->error = error;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d agno 0x%x error %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->error,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_AG_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_ag_error_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, agno, error, caller_ip))
DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
/* refcount tracepoint classes */
-/* reuse the discard trace class for agbno/aglen-based traces */
-#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+DECLARE_EVENT_CLASS(xfs_refcount_class,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ xfs_extlen_t len),
+ TP_ARGS(cur, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REFCOUNT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_class, name, \
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(cur, agbno, len))
-/* ag btree lookup tracepoint class */
TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
-DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_lookup_t dir),
- TP_ARGS(mp, agno, agbno, dir),
+TRACE_EVENT(xfs_refcount_lookup,
+ TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t agbno,
+ xfs_lookup_t dir),
+ TP_ARGS(cur, agbno, dir),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3090,8 +3219,8 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__field(xfs_lookup_t, dir)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->agbno = agbno;
__entry->dir = dir;
),
@@ -3103,17 +3232,10 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->dir)
)
-#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
-DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_lookup_t dir), \
- TP_ARGS(mp, agno, agbno, dir))
-
/* single-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *irec),
- TP_ARGS(mp, agno, irec),
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec),
+ TP_ARGS(cur, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3123,8 +3245,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -3141,15 +3263,14 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- struct xfs_refcount_irec *irec), \
- TP_ARGS(mp, agno, irec))
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec), \
+ TP_ARGS(cur, irec))
/* single-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
- TP_ARGS(mp, agno, irec, agbno),
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec,
+ xfs_agblock_t agbno),
+ TP_ARGS(cur, irec, agbno),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3160,8 +3281,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
__field(xfs_agblock_t, agbno)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -3180,15 +3301,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
- TP_ARGS(mp, agno, irec, agbno))
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, \
+ xfs_agblock_t agbno), \
+ TP_ARGS(cur, irec, agbno))
/* double-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
- TP_ARGS(mp, agno, i1, i2),
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+ struct xfs_refcount_irec *i2),
+ TP_ARGS(cur, i1, i2),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3202,8 +3323,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__field(xfs_nlink_t, i2_refcount)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3229,16 +3350,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
- TP_ARGS(mp, agno, i1, i2))
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+ struct xfs_refcount_irec *i2), \
+ TP_ARGS(cur, i1, i2))
/* double-rcext and an agbno tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
- xfs_agblock_t agbno),
- TP_ARGS(mp, agno, i1, i2, agbno),
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+ struct xfs_refcount_irec *i2, xfs_agblock_t agbno),
+ TP_ARGS(cur, i1, i2, agbno),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3253,8 +3373,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__field(xfs_agblock_t, agbno)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3282,17 +3402,15 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
- xfs_agblock_t agbno), \
- TP_ARGS(mp, agno, i1, i2, agbno))
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+ struct xfs_refcount_irec *i2, xfs_agblock_t agbno), \
+ TP_ARGS(cur, i1, i2, agbno))
/* triple-rcext tracepoint class */
DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
- struct xfs_refcount_irec *i3),
- TP_ARGS(mp, agno, i1, i2, i3),
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1,
+ struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3),
+ TP_ARGS(cur, i1, i2, i3),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3310,8 +3428,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
__field(xfs_nlink_t, i3_refcount)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->agno = cur->bc_ag.pag->pag_agno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3346,109 +3464,82 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
- struct xfs_refcount_irec *i3), \
- TP_ARGS(mp, agno, i1, i2, i3))
+ TP_PROTO(struct xfs_btree_cur *cur, struct xfs_refcount_irec *i1, \
+ struct xfs_refcount_irec *i2, struct xfs_refcount_irec *i3), \
+ TP_ARGS(cur, i1, i2, i3))
/* refcount btree tracepoints */
-DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_update_error);
/* refcount adjustment tracepoints */
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_increase);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_decrease);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_increase);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_cow_decrease);
DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
-DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_right_extent_error);
/* reflink helpers */
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
-DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
-DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared);
+DEFINE_REFCOUNT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_BTREE_ERROR_EVENT(xfs_refcount_find_shared_error);
+
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_INCREASE);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_DECREASE);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_ALLOC_COW);
+TRACE_DEFINE_ENUM(XFS_REFCOUNT_FREE_COW);
DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int type, xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, type, agbno, len),
+ TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc),
+ TP_ARGS(mp, refc),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(int, type)
+ __field(int, op)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->type = type;
- __entry->agbno = agbno;
- __entry->len = len;
+ __entry->agno = XFS_FSB_TO_AGNO(mp, refc->ri_startblock);
+ __entry->op = refc->ri_type;
+ __entry->agbno = XFS_FSB_TO_AGBNO(mp, refc->ri_startblock);
+ __entry->len = refc->ri_blockcount;
),
- TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d op %s agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->op, XFS_REFCOUNT_INTENT_STRINGS),
__entry->agno,
__entry->agbno,
__entry->len)
);
#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
DEFINE_EVENT(xfs_refcount_deferred_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- int type, \
- xfs_agblock_t bno, \
- xfs_extlen_t len), \
- TP_ARGS(mp, agno, type, bno, len))
+ TP_PROTO(struct xfs_mount *mp, struct xfs_refcount_intent *refc), \
+ TP_ARGS(mp, refc))
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
-
-TRACE_EVENT(xfs_refcount_finish_one_leftover,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int type, xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, type, agbno, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(int, type)
- __field(xfs_agblock_t, agbno)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->type = type;
- __entry->agbno = agbno;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->agno,
- __entry->agbno,
- __entry->len)
-);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_finish_one_leftover);
/* simple inode-based error/%ip tracepoint class */
DECLARE_EVENT_CLASS(xfs_inode_error_class,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 828da4ac4316..bdf3704dc301 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -725,135 +725,6 @@ xfs_trans_free_items(
}
}
-static inline void
-xfs_log_item_batch_insert(
- struct xfs_ail *ailp,
- struct xfs_ail_cursor *cur,
- struct xfs_log_item **log_items,
- int nr_items,
- xfs_lsn_t commit_lsn)
-{
- int i;
-
- spin_lock(&ailp->ail_lock);
- /* xfs_trans_ail_update_bulk drops ailp->ail_lock */
- xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
-
- for (i = 0; i < nr_items; i++) {
- struct xfs_log_item *lip = log_items[i];
-
- if (lip->li_ops->iop_unpin)
- lip->li_ops->iop_unpin(lip, 0);
- }
-}
-
-/*
- * Bulk operation version of xfs_trans_committed that takes a log vector of
- * items to insert into the AIL. This uses bulk AIL insertion techniques to
- * minimise lock traffic.
- *
- * If we are called with the aborted flag set, it is because a log write during
- * a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through iop_committed and iop_committing, which
- * means that checkpoint commit abort handling is treated exactly the same
- * as an iclog write error even though we haven't started any IO yet. Hence in
- * this case all we need to do is iop_committed processing, followed by an
- * iop_unpin(aborted) call.
- *
- * The AIL cursor is used to optimise the insert process. If commit_lsn is not
- * at the end of the AIL, the insert cursor avoids the need to walk
- * the AIL to find the insertion point on every xfs_log_item_batch_insert()
- * call. This saves a lot of needless list walking and is a net win, even
- * though it slightly increases that amount of AIL lock traffic to set it up
- * and tear it down.
- */
-void
-xfs_trans_committed_bulk(
- struct xfs_ail *ailp,
- struct list_head *lv_chain,
- xfs_lsn_t commit_lsn,
- bool aborted)
-{
-#define LOG_ITEM_BATCH_SIZE 32
- struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
- struct xfs_log_vec *lv;
- struct xfs_ail_cursor cur;
- int i = 0;
-
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
- spin_unlock(&ailp->ail_lock);
-
- /* unpin all the log items */
- list_for_each_entry(lv, lv_chain, lv_list) {
- struct xfs_log_item *lip = lv->lv_item;
- xfs_lsn_t item_lsn;
-
- if (aborted)
- set_bit(XFS_LI_ABORTED, &lip->li_flags);
-
- if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
- lip->li_ops->iop_release(lip);
- continue;
- }
-
- if (lip->li_ops->iop_committed)
- item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
- else
- item_lsn = commit_lsn;
-
- /* item_lsn of -1 means the item needs no further processing */
- if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
- continue;
-
- /*
- * if we are aborting the operation, no point in inserting the
- * object into the AIL as we are in a shutdown situation.
- */
- if (aborted) {
- ASSERT(xlog_is_shutdown(ailp->ail_log));
- if (lip->li_ops->iop_unpin)
- lip->li_ops->iop_unpin(lip, 1);
- continue;
- }
-
- if (item_lsn != commit_lsn) {
-
- /*
- * Not a bulk update option due to unusual item_lsn.
- * Push into AIL immediately, rechecking the lsn once
- * we have the ail lock. Then unpin the item. This does
- * not affect the AIL cursor the bulk insert path is
- * using.
- */
- spin_lock(&ailp->ail_lock);
- if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
- xfs_trans_ail_update(ailp, lip, item_lsn);
- else
- spin_unlock(&ailp->ail_lock);
- if (lip->li_ops->iop_unpin)
- lip->li_ops->iop_unpin(lip, 0);
- continue;
- }
-
- /* Item is a candidate for bulk AIL insert. */
- log_items[i++] = lv->lv_item;
- if (i >= LOG_ITEM_BATCH_SIZE) {
- xfs_log_item_batch_insert(ailp, &cur, log_items,
- LOG_ITEM_BATCH_SIZE, commit_lsn);
- i = 0;
- }
- }
-
- /* make sure we insert the remainder! */
- if (i)
- xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
-
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-}
-
/*
* Sort transaction items prior to running precommit operations. This will
* attempt to order the items such that they will always be locked in the same
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 1636663707dc..f06cc0f41665 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -58,13 +58,15 @@ struct xfs_log_item {
#define XFS_LI_FAILED 2
#define XFS_LI_DIRTY 3
#define XFS_LI_WHITEOUT 4
+#define XFS_LI_FLUSHING 5
#define XFS_LI_FLAGS \
{ (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
{ (1u << XFS_LI_ABORTED), "ABORTED" }, \
{ (1u << XFS_LI_FAILED), "FAILED" }, \
{ (1u << XFS_LI_DIRTY), "DIRTY" }, \
- { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }, \
+ { (1u << XFS_LI_FLUSHING), "FLUSHING" }
struct xfs_item_ops {
unsigned flags;
@@ -224,7 +226,6 @@ void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
-void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e4c343096f95..0fafcc9f3dbe 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -135,25 +135,6 @@ xfs_ail_min_lsn(
}
/*
- * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
- */
-static xfs_lsn_t
-xfs_ail_max_lsn(
- struct xfs_ail *ailp)
-{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
-
- spin_lock(&ailp->ail_lock);
- lip = xfs_ail_max(ailp);
- if (lip)
- lsn = lip->li_lsn;
- spin_unlock(&ailp->ail_lock);
-
- return lsn;
-}
-
-/*
* The cursor keeps track of where our current traversal is up to by tracking
* the next item in the list for us. However, for this to be safe, removing an
* object from the AIL needs to invalidate any cursor that points to it. hence
@@ -414,6 +395,74 @@ xfsaild_push_item(
return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
+/*
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * at least 25% of the log space free. If the log free space already meets this
+ * threshold, this function returns the lowest LSN in the AIL to slowly keep
+ * writeback ticking over and the tail of the log moving forward.
+ */
+static xfs_lsn_t
+xfs_ail_calc_push_target(
+ struct xfs_ail *ailp)
+{
+ struct xlog *log = ailp->ail_log;
+ struct xfs_log_item *lip;
+ xfs_lsn_t target_lsn;
+ xfs_lsn_t max_lsn;
+ xfs_lsn_t min_lsn;
+ int32_t free_bytes;
+ uint32_t target_block;
+ uint32_t target_cycle;
+
+ lockdep_assert_held(&ailp->ail_lock);
+
+ lip = xfs_ail_max(ailp);
+ if (!lip)
+ return NULLCOMMITLSN;
+
+ max_lsn = lip->li_lsn;
+ min_lsn = __xfs_ail_min_lsn(ailp);
+
+ /*
+ * If we are supposed to push all the items in the AIL, we want to push
+ * to the current head. We then clear the push flag so that we don't
+ * keep pushing newly queued items beyond where the push all command was
+ * run. If the push waiter wants to empty the ail, it should queue
+ * itself on the ail_empty wait queue.
+ */
+ if (test_and_clear_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate))
+ return max_lsn;
+
+ /* If someone wants the AIL empty, keep pushing everything we have. */
+ if (waitqueue_active(&ailp->ail_empty))
+ return max_lsn;
+
+ /*
+ * Background pushing - attempt to keep 25% of the log free and if we
+ * have that much free retain the existing target.
+ */
+ free_bytes = log->l_logsize - xlog_lsn_sub(log, max_lsn, min_lsn);
+ if (free_bytes >= log->l_logsize >> 2)
+ return ailp->ail_target;
+
+ target_cycle = CYCLE_LSN(min_lsn);
+ target_block = BLOCK_LSN(min_lsn) + (log->l_logBBsize >> 2);
+ if (target_block >= log->l_logBBsize) {
+ target_block -= log->l_logBBsize;
+ target_cycle += 1;
+ }
+ target_lsn = xlog_assign_lsn(target_cycle, target_block);
+
+ /* Cap the target to the highest LSN known to be in the AIL. */
+ if (XFS_LSN_CMP(target_lsn, max_lsn) > 0)
+ return max_lsn;
+
+ /* If the existing target is higher than the new target, keep it. */
+ if (XFS_LSN_CMP(ailp->ail_target, target_lsn) >= 0)
+ return ailp->ail_target;
+ return target_lsn;
+}
+
static long
xfsaild_push(
struct xfs_ail *ailp)
@@ -422,7 +471,6 @@ xfsaild_push(
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
- xfs_lsn_t target = NULLCOMMITLSN;
long tout;
int stuck = 0;
int flushing = 0;
@@ -447,37 +495,26 @@ xfsaild_push(
}
spin_lock(&ailp->ail_lock);
-
- /*
- * If we have a sync push waiter, we always have to push till the AIL is
- * empty. Update the target to point to the end of the AIL so that
- * capture updates that occur after the sync push waiter has gone to
- * sleep.
- */
- if (waitqueue_active(&ailp->ail_empty)) {
- lip = xfs_ail_max(ailp);
- if (lip)
- target = lip->li_lsn;
- } else {
- /* barrier matches the ail_target update in xfs_ail_push() */
- smp_rmb();
- target = ailp->ail_target;
- ailp->ail_target_prev = target;
- }
+ WRITE_ONCE(ailp->ail_target, xfs_ail_calc_push_target(ailp));
+ if (ailp->ail_target == NULLCOMMITLSN)
+ goto out_done;
/* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
if (!lip)
- goto out_done;
+ goto out_done_cursor;
XFS_STATS_INC(mp, xs_push_ail);
- ASSERT(target != NULLCOMMITLSN);
+ ASSERT(ailp->ail_target != NULLCOMMITLSN);
lsn = lip->li_lsn;
- while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
+ while ((XFS_LSN_CMP(lip->li_lsn, ailp->ail_target) <= 0)) {
int lock_result;
+ if (test_bit(XFS_LI_FLUSHING, &lip->li_flags))
+ goto next_item;
+
/*
* Note that iop_push may unlock and reacquire the AIL lock. We
* rely on the AIL cursor implementation to be able to deal with
@@ -547,20 +584,24 @@ xfsaild_push(
if (stuck > 100)
break;
+next_item:
lip = xfs_trans_ail_cursor_next(ailp, &cur);
if (lip == NULL)
break;
+ if (lip->li_lsn != lsn && count > 1000)
+ break;
lsn = lip->li_lsn;
}
-out_done:
+out_done_cursor:
xfs_trans_ail_cursor_done(&cur);
+out_done:
spin_unlock(&ailp->ail_lock);
if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
ailp->ail_log_flush++;
- if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
+ if (!count || XFS_LSN_CMP(lsn, ailp->ail_target) >= 0) {
/*
* We reached the target or the AIL is empty, so wait a bit
* longer for I/O to complete and remove pushed items from the
@@ -585,7 +626,7 @@ out_done:
/*
* Assume we have more work to do in a short while.
*/
- tout = 10;
+ tout = 0;
}
return tout;
@@ -603,7 +644,7 @@ xfsaild(
set_freezable();
while (1) {
- if (tout && tout <= 20)
+ if (tout)
set_current_state(TASK_KILLABLE|TASK_FREEZABLE);
else
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
@@ -639,21 +680,9 @@ xfsaild(
break;
}
+ /* Idle if the AIL is empty. */
spin_lock(&ailp->ail_lock);
-
- /*
- * Idle if the AIL is empty and we are not racing with a target
- * update. We check the AIL after we set the task to a sleep
- * state to guarantee that we either catch an ail_target update
- * or that a wake_up resets the state to TASK_RUNNING.
- * Otherwise, we run the risk of sleeping indefinitely.
- *
- * The barrier matches the ail_target update in xfs_ail_push().
- */
- smp_rmb();
- if (!xfs_ail_min(ailp) &&
- ailp->ail_target == ailp->ail_target_prev &&
- list_empty(&ailp->ail_buf_list)) {
+ if (!xfs_ail_min(ailp) && list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
schedule();
tout = 0;
@@ -676,56 +705,6 @@ xfsaild(
}
/*
- * This routine is called to move the tail of the AIL forward. It does this by
- * trying to flush items in the AIL whose lsns are below the given
- * threshold_lsn.
- *
- * The push is run asynchronously in a workqueue, which means the caller needs
- * to handle waiting on the async flush for space to become available.
- * We don't want to interrupt any push that is in progress, hence we only queue
- * work if we set the pushing bit appropriately.
- *
- * We do this unlocked - we only need to know whether there is anything in the
- * AIL at the time we are called. We don't need to access the contents of
- * any of the objects, so the lock is not needed.
- */
-void
-xfs_ail_push(
- struct xfs_ail *ailp,
- xfs_lsn_t threshold_lsn)
-{
- struct xfs_log_item *lip;
-
- lip = xfs_ail_min(ailp);
- if (!lip || xlog_is_shutdown(ailp->ail_log) ||
- XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
- return;
-
- /*
- * Ensure that the new target is noticed in push code before it clears
- * the XFS_AIL_PUSHING_BIT.
- */
- smp_wmb();
- xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn);
- smp_wmb();
-
- wake_up_process(ailp->ail_task);
-}
-
-/*
- * Push out all items in the AIL immediately
- */
-void
-xfs_ail_push_all(
- struct xfs_ail *ailp)
-{
- xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
-
- if (threshold_lsn)
- xfs_ail_push(ailp, threshold_lsn);
-}
-
-/*
* Push out all items in the AIL immediately and wait until the AIL is empty.
*/
void
@@ -748,21 +727,49 @@ xfs_ail_push_all_sync(
}
void
+__xfs_ail_assign_tail_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xlog *log = ailp->ail_log;
+ xfs_lsn_t tail_lsn;
+
+ assert_spin_locked(&ailp->ail_lock);
+
+ if (xlog_is_shutdown(log))
+ return;
+
+ tail_lsn = __xfs_ail_min_lsn(ailp);
+ if (!tail_lsn)
+ tail_lsn = ailp->ail_head_lsn;
+
+ WRITE_ONCE(log->l_tail_space,
+ xlog_lsn_sub(log, ailp->ail_head_lsn, tail_lsn));
+ trace_xfs_log_assign_tail_lsn(log, tail_lsn);
+ atomic64_set(&log->l_tail_lsn, tail_lsn);
+}
+
+/*
+ * Callers should pass the original tail lsn so that we can detect if the tail
+ * has moved as a result of the operation that was performed. If the caller
+ * needs to force a tail space update, it should pass NULLCOMMITLSN to bypass
+ * the "did the tail LSN change?" checks. If the caller wants to avoid a tail
+ * update (e.g. it knows the tail did not change) it should pass an @old_lsn of
+ * 0.
+ */
+void
xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
struct xlog *log = ailp->ail_log;
- /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ /* If the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
spin_unlock(&ailp->ail_lock);
return;
}
- if (!xlog_is_shutdown(log))
- xlog_assign_tail_lsn_locked(log->l_mp);
-
+ __xfs_ail_assign_tail_lsn(ailp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
@@ -829,6 +836,19 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
+ /*
+ * If this is the first insert, wake up the push daemon so it can
+ * actively scan for items to push. We also need to do a log tail
+ * LSN update to ensure that it is correctly tracked by the log, so
+ * set the tail_lsn to NULLCOMMITLSN so that xfs_ail_update_finish()
+ * will see that the tail lsn has changed and will update the tail
+ * appropriately.
+ */
+ if (!mlip) {
+ wake_up_process(ailp->ail_task);
+ tail_lsn = NULLCOMMITLSN;
+ }
+
xfs_ail_update_finish(ailp, tail_lsn);
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index d5400150358e..bd841df93021 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,9 +19,6 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
-void xfs_trans_committed_bulk(struct xfs_ail *ailp,
- struct list_head *lv_chain,
- xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
*
@@ -55,16 +52,20 @@ struct xfs_ail {
struct xlog *ail_log;
struct task_struct *ail_task;
struct list_head ail_head;
- xfs_lsn_t ail_target;
- xfs_lsn_t ail_target_prev;
struct list_head ail_cursors;
spinlock_t ail_lock;
xfs_lsn_t ail_last_pushed_lsn;
+ xfs_lsn_t ail_head_lsn;
int ail_log_flush;
+ unsigned long ail_opstate;
struct list_head ail_buf_list;
wait_queue_head_t ail_empty;
+ xfs_lsn_t ail_target;
};
+/* Push all items out of the AIL immediately. */
+#define XFS_AIL_OPSTATE_PUSH_ALL 0u
+
/*
* From xfs_trans_ail.c
*/
@@ -101,10 +102,23 @@ void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
__releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type);
-void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
-void xfs_ail_push_all(struct xfs_ail *);
-void xfs_ail_push_all_sync(struct xfs_ail *);
-struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp);
+static inline void xfs_ail_push(struct xfs_ail *ailp)
+{
+ wake_up_process(ailp->ail_task);
+}
+
+static inline void xfs_ail_push_all(struct xfs_ail *ailp)
+{
+ if (!test_and_set_bit(XFS_AIL_OPSTATE_PUSH_ALL, &ailp->ail_opstate))
+ xfs_ail_push(ailp);
+}
+
+static inline xfs_lsn_t xfs_ail_get_push_target(struct xfs_ail *ailp)
+{
+ return READ_ONCE(ailp->ail_target);
+}
+
+void xfs_ail_push_all_sync(struct xfs_ail *ailp);
xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
@@ -117,6 +131,18 @@ struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur);
void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
+void __xfs_ail_assign_tail_lsn(struct xfs_ail *ailp);
+
+static inline void
+xfs_ail_assign_tail_lsn(
+ struct xfs_ail *ailp)
+{
+
+ spin_lock(&ailp->ail_lock);
+ __xfs_ail_assign_tail_lsn(ailp);
+ spin_unlock(&ailp->ail_lock);
+}
+
#if BITS_PER_LONG != 64
static inline void
xfs_trans_ail_copy_lsn(
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 964fa7f24003..faf1eb87895d 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -662,6 +662,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_op = &zonefs_file_inode_operations;
inode->i_fop = &zonefs_file_operations;
inode->i_mapping->a_ops = &zonefs_file_aops;
+ mapping_set_large_folios(inode->i_mapping);
/* Update the inode access rights depending on the zone condition */
zonefs_inode_update_mode(inode);