summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c14
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/dir.c225
-rw-r--r--fs/afs/file.c483
-rw-r--r--fs/afs/fs_operation.c4
-rw-r--r--fs/afs/fsclient.c108
-rw-r--r--fs/afs/inode.c13
-rw-r--r--fs/afs/internal.h59
-rw-r--r--fs/afs/rxrpc.c150
-rw-r--r--fs/afs/write.c658
-rw-r--r--fs/afs/yfsclient.c82
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_elf_fdpic.c3
-rw-r--r--fs/block_dev.c8
-rw-r--r--fs/btrfs/Makefile12
-rw-r--r--fs/btrfs/backref.c33
-rw-r--r--fs/btrfs/block-group.c207
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h33
-rw-r--r--fs/btrfs/check-integrity.c14
-rw-r--r--fs/btrfs/compression.c15
-rw-r--r--fs/btrfs/ctree.c984
-rw-r--r--fs/btrfs/ctree.h83
-rw-r--r--fs/btrfs/delayed-inode.c35
-rw-r--r--fs/btrfs/delayed-ref.c31
-rw-r--r--fs/btrfs/dev-replace.c3
-rw-r--r--fs/btrfs/disk-io.c181
-rw-r--r--fs/btrfs/extent-tree.c21
-rw-r--r--fs/btrfs/extent_io.c439
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c1
-rw-r--r--fs/btrfs/file.c118
-rw-r--r--fs/btrfs/free-space-cache.c9
-rw-r--r--fs/btrfs/inode.c147
-rw-r--r--fs/btrfs/ioctl.c269
-rw-r--r--fs/btrfs/lzo.c9
-rw-r--r--fs/btrfs/ordered-data.c19
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/qgroup.c59
-rw-r--r--fs/btrfs/raid56.c73
-rw-r--r--fs/btrfs/reflink.c65
-rw-r--r--fs/btrfs/relocation.c448
-rw-r--r--fs/btrfs/scrub.c13
-rw-r--r--fs/btrfs/send.c43
-rw-r--r--fs/btrfs/space-info.c4
-rw-r--r--fs/btrfs/subpage.c140
-rw-r--r--fs/btrfs/subpage.h7
-rw-r--r--fs/btrfs/super.c26
-rw-r--r--fs/btrfs/sysfs.c50
-rw-r--r--fs/btrfs/transaction.c59
-rw-r--r--fs/btrfs/transaction.h9
-rw-r--r--fs/btrfs/tree-checker.c5
-rw-r--r--fs/btrfs/tree-log.c24
-rw-r--r--fs/btrfs/tree-mod-log.c929
-rw-r--r--fs/btrfs/tree-mod-log.h53
-rw-r--r--fs/btrfs/volumes.c129
-rw-r--r--fs/btrfs/volumes.h1
-rw-r--r--fs/btrfs/zoned.c60
-rw-r--r--fs/btrfs/zoned.h6
-rw-r--r--fs/cachefiles/Makefile1
-rw-r--r--fs/cachefiles/bind.c6
-rw-r--r--fs/cachefiles/interface.c5
-rw-r--r--fs/cachefiles/internal.h9
-rw-r--r--fs/cachefiles/io.c420
-rw-r--r--fs/cachefiles/rdwr.c7
-rw-r--r--fs/ceph/caps.c8
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/export.c9
-rw-r--r--fs/ceph/inode.c41
-rw-r--r--fs/cifs/Kconfig3
-rw-r--r--fs/cifs/Makefile5
-rw-r--r--fs/cifs/cifs_debug.c58
-rw-r--r--fs/cifs/cifs_dfs_ref.c14
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifs_swn.h27
-rw-r--r--fs/cifs/cifsacl.c7
-rw-r--r--fs/cifs/cifsfs.c50
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h38
-rw-r--r--fs/cifs/cifspdu.h7
-rw-r--r--fs/cifs/cifsproto.h32
-rw-r--r--fs/cifs/cifssmb.c52
-rw-r--r--fs/cifs/connect.c49
-rw-r--r--fs/cifs/dfs_cache.c41
-rw-r--r--fs/cifs/dir.c169
-rw-r--r--fs/cifs/file.c82
-rw-r--r--fs/cifs/fs_context.c143
-rw-r--r--fs/cifs/fs_context.h13
-rw-r--r--fs/cifs/inode.c197
-rw-r--r--fs/cifs/ioctl.c13
-rw-r--r--fs/cifs/link.c46
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c19
-rw-r--r--fs/cifs/smb1ops.c6
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c10
-rw-r--r--fs/cifs/smb2misc.c5
-rw-r--r--fs/cifs/smb2ops.c222
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/cifs/smb2pdu.h49
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2transport.c37
-rw-r--r--fs/cifs/unc.c4
-rw-r--r--fs/cifs/xattr.c40
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coredump.c72
-rw-r--r--fs/crypto/Kconfig30
-rw-r--r--fs/debugfs/file.c94
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/ecryptfs/inode.c22
-rw-r--r--fs/efivarfs/file.c77
-rw-r--r--fs/efivarfs/inode.c44
-rw-r--r--fs/erofs/Kconfig14
-rw-r--r--fs/erofs/Makefile2
-rw-r--r--fs/erofs/data.c19
-rw-r--r--fs/erofs/decompressor.c272
-rw-r--r--fs/erofs/erofs_fs.h54
-rw-r--r--fs/erofs/inode.c7
-rw-r--r--fs/erofs/internal.h86
-rw-r--r--fs/erofs/pcpubuf.c148
-rw-r--r--fs/erofs/super.c148
-rw-r--r--fs/erofs/utils.c12
-rw-r--r--fs/erofs/zdata.c254
-rw-r--r--fs/erofs/zdata.h14
-rw-r--r--fs/erofs/zmap.c164
-rw-r--r--fs/eventpoll.c52
-rw-r--r--fs/ext2/ext2.h7
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/ioctl.c88
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext4/ext4.h12
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsmap.c4
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/ioctl.c208
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/f2fs/f2fs.h3
-rw-r--r--fs/f2fs/file.c218
-rw-r--r--fs/f2fs/namei.c2
-rw-r--r--fs/f2fs/verity.c2
-rw-r--r--fs/file.c21
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/Makefile1
-rw-r--r--fs/fscache/internal.h4
-rw-r--r--fs/fscache/io.c116
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fscache/stats.c1
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/dir.c12
-rw-r--r--fs/fuse/file.c435
-rw-r--r--fs/fuse/fuse_i.h40
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/fuse/ioctl.c490
-rw-r--r--fs/fuse/readdir.c2
-rw-r--r--fs/gfs2/file.c63
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/glops.c22
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/lops.c3
-rw-r--r--fs/gfs2/super.c14
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h14
-rw-r--r--fs/hfsplus/inode.c54
-rw-r--r--fs/hfsplus/ioctl.c84
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/inode.c91
-rw-r--r--fs/io-wq.c40
-rw-r--r--fs/io_uring.c155
-rw-r--r--fs/ioctl.c325
-rw-r--r--fs/iomap/buffered-io.c3
-rw-r--r--fs/iomap/swapfile.c38
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/ioctl.c111
-rw-r--r--fs/jfs/jfs_dinode.h7
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/namei.c6
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/locks.c66
-rw-r--r--fs/namei.c39
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/netfs/Kconfig23
-rw-r--r--fs/netfs/Makefile5
-rw-r--r--fs/netfs/internal.h97
-rw-r--r--fs/netfs/read_helper.c1185
-rw-r--r--fs/netfs/stats.c59
-rw-r--r--fs/nfs/fs_context.c3
-rw-r--r--fs/nfs/inode.c6
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/super.c6
-rw-r--r--fs/nfs_common/nfsacl.c71
-rw-r--r--fs/nfsd/Kconfig6
-rw-r--r--fs/nfsd/netns.h6
-rw-r--r--fs/nfsd/nfs2acl.c87
-rw-r--r--fs/nfsd/nfs3acl.c39
-rw-r--r--fs/nfsd/nfs3proc.c97
-rw-r--r--fs/nfsd/nfs3xdr.c1043
-rw-r--r--fs/nfsd/nfs4proc.c40
-rw-r--r--fs/nfsd/nfs4recover.c8
-rw-r--r--fs/nfsd/nfs4state.c82
-rw-r--r--fs/nfsd/nfs4xdr.c110
-rw-r--r--fs/nfsd/nfsctl.c28
-rw-r--r--fs/nfsd/nfsd.h7
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/nfsproc.c55
-rw-r--r--fs/nfsd/nfssvc.c42
-rw-r--r--fs/nfsd/nfsxdr.c413
-rw-r--r--fs/nfsd/state.h4
-rw-r--r--fs/nfsd/trace.h24
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr.h23
-rw-r--r--fs/nfsd/xdr3.h37
-rw-r--r--fs/nfsd/xdr4.h2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/ioctl.c61
-rw-r--r--fs/nilfs2/namei.c2
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/ocfs2/aops.c11
-rw-r--r--fs/ocfs2/dlmglue.c12
-rw-r--r--fs/ocfs2/file.c10
-rw-r--r--fs/ocfs2/ioctl.c59
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h8
-rw-r--r--fs/openpromfs/inode.c67
-rw-r--r--fs/orangefs/file.c79
-rw-r--r--fs/orangefs/inode.c50
-rw-r--r--fs/orangefs/orangefs-utils.c2
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/file.c121
-rw-r--r--fs/overlayfs/inode.c77
-rw-r--r--fs/overlayfs/namei.c4
-rw-r--r--fs/overlayfs/overlayfs.h5
-rw-r--r--fs/overlayfs/readdir.c4
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/pstore/platform.c5
-rw-r--r--fs/pstore/ram.c7
-rw-r--r--fs/pstore/ram_core.c18
-rw-r--r--fs/readdir.c6
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/ioctl.c121
-rw-r--r--fs/reiserfs/namei.c2
-rw-r--r--fs/reiserfs/reiserfs.h7
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.h2
-rw-r--r--fs/signalfd.c4
-rw-r--r--fs/squashfs/export.c8
-rw-r--r--fs/squashfs/id.c6
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/xattr_id.c6
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/gc.c7
-rw-r--r--fs/ubifs/ioctl.c78
-rw-r--r--fs/ubifs/replay.c4
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/vboxsf/dir.c4
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/vboxsf/utils.c68
-rw-r--r--fs/vboxsf/vfsmod.h4
-rw-r--r--fs/verity/Kconfig8
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/libxfs/xfs_fs.h4
-rw-r--r--fs/xfs/scrub/bitmap.c4
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_buf.c6
-rw-r--r--fs/xfs/xfs_extent_busy.c4
-rw-r--r--fs/xfs/xfs_extent_busy.h3
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_inode.c10
-rw-r--r--fs/xfs/xfs_ioctl.c258
-rw-r--r--fs/xfs/xfs_ioctl.h11
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_ioctl32.h2
-rw-r--r--fs/xfs/xfs_iops.c7
-rw-r--r--fs/xfs/xfs_refcount_item.c4
-rw-r--r--fs/xfs/xfs_rmap_item.c4
-rw-r--r--fs/xfs/xfs_symlink.c4
286 files changed, 11196 insertions, 6874 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d97f0b45e9c..795706520b5e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data)
umode = p9mode2unixmode(v9ses, st, &rdev);
/* don't match inode of different type */
- if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ if (inode_wrong_type(inode, umode))
return 0;
/* compare qid details */
@@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
* Don't update inode if the file type is different
*/
umode = p9mode2unixmode(v9ses, st, &rdev);
- if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ if (inode_wrong_type(inode, umode))
goto out;
/*
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1dc7af046615..e1c0240b51c0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data)
struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
/* don't match inode of different type */
- if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ if (inode_wrong_type(inode, st->st_mode))
return 0;
if (inode->i_generation != st->st_gen)
@@ -663,14 +663,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
if (stat->st_result_mask & P9_STATS_NLINK)
set_nlink(inode, stat->st_nlink);
if (stat->st_result_mask & P9_STATS_MODE) {
- inode->i_mode = stat->st_mode;
- if ((S_ISBLK(inode->i_mode)) ||
- (S_ISCHR(inode->i_mode)))
- init_special_inode(inode, inode->i_mode,
- inode->i_rdev);
+ mode = stat->st_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
}
- if (stat->st_result_mask & P9_STATS_RDEV)
- inode->i_rdev = new_decode_dev(stat->st_rdev);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
stat->st_result_mask & P9_STATS_SIZE)
v9fs_i_size_write(inode, stat->st_size);
@@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
/*
* Don't update inode if the file type is different
*/
- if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ if (inode_wrong_type(inode, st->st_mode))
goto out;
/*
diff --git a/fs/Kconfig b/fs/Kconfig
index a55bda4233bb..97e7b77c9309 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
+source "fs/netfs/Kconfig"
source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 3215fe205256..9c708e1fbe8f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ obj-y += devpts/
obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
+obj-$(CONFIG_NETFS_SUPPORT) += netfs/
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 1ad211d72b3b..fc8ba9142f2f 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -4,6 +4,7 @@ config AFS_FS
depends on INET
select AF_RXRPC
select DNS_RESOLVER
+ select NETFS_SUPPORT
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 17548c1faf02..117df15e5367 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -103,6 +103,35 @@ struct afs_lookup_cookie {
};
/*
+ * Drop the refs that we're holding on the pages we were reading into. We've
+ * got refs on the first nr_pages pages.
+ */
+static void afs_dir_read_cleanup(struct afs_read *req)
+{
+ struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t last = req->nr_pages - 1;
+
+ XA_STATE(xas, &mapping->i_pages, 0);
+
+ if (unlikely(!req->nr_pages))
+ return;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+ BUG_ON(xa_is_value(page));
+ BUG_ON(PageCompound(page));
+ ASSERTCMP(page->mapping, ==, mapping);
+
+ put_page(page);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
* check that a directory page is valid
*/
static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
@@ -127,7 +156,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
qty /= sizeof(union afs_xdr_dir_block);
/* check them */
- dbuf = kmap(page);
+ dbuf = kmap_atomic(page);
for (tmp = 0; tmp < qty; tmp++) {
if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
@@ -146,7 +175,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
}
- kunmap(page);
+ kunmap_atomic(dbuf);
checked:
afs_stat_v(dvnode, n_read_dir);
@@ -157,35 +186,74 @@ error:
}
/*
- * Check the contents of a directory that we've just read.
+ * Dump the contents of a directory.
*/
-static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req)
+static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
struct afs_xdr_dir_page *dbuf;
- unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct page *page;
+ unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ pgoff_t last = req->nr_pages - 1;
- for (i = 0; i < req->nr_pages; i++)
- if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len))
- goto bad;
- return true;
+ XA_STATE(xas, &mapping->i_pages, 0);
-bad:
- pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n",
+ pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n",
dvnode->fid.vid, dvnode->fid.vnode,
- req->file_size, req->len, req->actual_len, req->remain);
- pr_warn("DIR %llx %x %x %x\n",
- req->pos, req->index, req->nr_pages, req->offset);
+ req->file_size, req->len, req->actual_len);
+ pr_warn("DIR %llx %x %zx %zx\n",
+ req->pos, req->nr_pages,
+ req->iter->iov_offset, iov_iter_count(req->iter));
- for (i = 0; i < req->nr_pages; i++) {
- dbuf = kmap(req->pages[i]);
- for (j = 0; j < qty; j++) {
- union afs_xdr_dir_block *block = &dbuf->blocks[j];
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ BUG_ON(PageCompound(page));
+ BUG_ON(page->mapping != mapping);
+
+ dbuf = kmap_atomic(page);
+ for (i = 0; i < qty; i++) {
+ union afs_xdr_dir_block *block = &dbuf->blocks[i];
- pr_warn("[%02x] %32phN\n", i * qty + j, block);
+ pr_warn("[%02lx] %32phN\n", page->index * qty + i, block);
}
- kunmap(req->pages[i]);
+ kunmap_atomic(dbuf);
}
- return false;
+}
+
+/*
+ * Check all the pages in a directory. All the pages are held pinned.
+ */
+static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
+{
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t last = req->nr_pages - 1;
+ int ret = 0;
+
+ XA_STATE(xas, &mapping->i_pages, 0);
+
+ if (unlikely(!req->nr_pages))
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ BUG_ON(PageCompound(page));
+ BUG_ON(page->mapping != mapping);
+
+ if (!afs_dir_check_page(dvnode, page, req->file_size)) {
+ afs_dir_dump(dvnode, req);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -214,57 +282,57 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
{
struct afs_read *req;
loff_t i_size;
- int nr_pages, nr_inline, i, n;
- int ret = -ENOMEM;
+ int nr_pages, i, n;
+ int ret;
+
+ _enter("");
-retry:
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&req->usage, 1);
+ req->vnode = dvnode;
+ req->key = key_get(key);
+ req->cleanup = afs_dir_read_cleanup;
+
+expand:
i_size = i_size_read(&dvnode->vfs_inode);
- if (i_size < 2048)
- return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small));
+ if (i_size < 2048) {
+ ret = afs_bad(dvnode, afs_file_error_dir_small);
+ goto error;
+ }
if (i_size > 2048 * 1024) {
trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
- return ERR_PTR(-EFBIG);
+ ret = -EFBIG;
+ goto error;
}
_enter("%llu", i_size);
- /* Get a request record to hold the page list. We want to hold it
- * inline if we can, but we don't want to make an order 1 allocation.
- */
nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE;
- nr_inline = nr_pages;
- if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *))
- nr_inline = 0;
- req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- refcount_set(&req->usage, 1);
- req->nr_pages = nr_pages;
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- if (nr_inline > 0) {
- req->pages = req->array;
- } else {
- req->pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!req->pages)
- goto error;
- }
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ 0, i_size);
+ req->iter = &req->def_iter;
- /* Get a list of all the pages that hold or will hold the directory
- * content. We need to fill in any gaps that we might find where the
- * memory reclaimer has been at work. If there are any gaps, we will
+ /* Fill in any gaps that we might find where the memory reclaimer has
+ * been at work and pin all the pages. If there are any gaps, we will
* need to reread the entire directory contents.
*/
- i = 0;
- do {
+ i = req->nr_pages;
+ while (i < nr_pages) {
+ struct page *pages[8], *page;
+
n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i,
- req->nr_pages - i,
- req->pages + i);
- _debug("find %u at %u/%u", n, i, req->nr_pages);
+ min_t(unsigned int, nr_pages - i,
+ ARRAY_SIZE(pages)),
+ pages);
+ _debug("find %u at %u/%u", n, i, nr_pages);
+
if (n == 0) {
gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask;
@@ -272,22 +340,24 @@ retry:
afs_stat_v(dvnode, n_inval);
ret = -ENOMEM;
- req->pages[i] = __page_cache_alloc(gfp);
- if (!req->pages[i])
+ page = __page_cache_alloc(gfp);
+ if (!page)
goto error;
- ret = add_to_page_cache_lru(req->pages[i],
+ ret = add_to_page_cache_lru(page,
dvnode->vfs_inode.i_mapping,
i, gfp);
if (ret < 0)
goto error;
- attach_page_private(req->pages[i], (void *)1);
- unlock_page(req->pages[i]);
+ attach_page_private(page, (void *)1);
+ unlock_page(page);
+ req->nr_pages++;
i++;
} else {
+ req->nr_pages += n;
i += n;
}
- } while (i < req->nr_pages);
+ }
/* If we're going to reload, we need to lock all the pages to prevent
* races.
@@ -305,18 +375,23 @@ retry:
if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
trace_afs_reload_dir(dvnode);
- ret = afs_fetch_data(dvnode, key, req);
+ ret = afs_fetch_data(dvnode, req);
if (ret < 0)
goto error_unlock;
task_io_account_read(PAGE_SIZE * req->nr_pages);
- if (req->len < req->file_size)
- goto content_has_grown;
+ if (req->len < req->file_size) {
+ /* The content has grown, so we need to expand the
+ * buffer.
+ */
+ up_write(&dvnode->validate_lock);
+ goto expand;
+ }
/* Validate the data we just read. */
- ret = -EIO;
- if (!afs_dir_check_pages(dvnode, req))
+ ret = afs_dir_check(dvnode, req);
+ if (ret < 0)
goto error_unlock;
// TODO: Trim excess pages
@@ -334,11 +409,6 @@ error:
afs_put_read(req);
_leave(" = %d", ret);
return ERR_PTR(ret);
-
-content_has_grown:
- up_write(&dvnode->validate_lock);
- afs_put_read(req);
- goto retry;
}
/*
@@ -448,6 +518,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
struct afs_read *req;
struct page *page;
unsigned blkoff, limit;
+ void __rcu **slot;
int ret;
_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
@@ -472,9 +543,15 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1);
/* Fetch the appropriate page from the directory and re-add it
- * to the LRU.
+ * to the LRU. We have all the pages pinned with an extra ref.
*/
- page = req->pages[blkoff / PAGE_SIZE];
+ rcu_read_lock();
+ page = NULL;
+ slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages,
+ blkoff / PAGE_SIZE);
+ if (slot)
+ page = radix_tree_deref_slot(slot);
+ rcu_read_unlock();
if (!page) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
@@ -2006,6 +2083,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
afs_stat_v(dvnode, n_inval);
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_SIZE)
+ if (offset == 0 && length == thp_size(page))
detach_page_private(page);
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 960b64268623..db035ae2a134 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -14,6 +14,7 @@
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/mm.h>
+#include <linux/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -22,8 +23,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
-static int afs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages);
+static void afs_readahead(struct readahead_control *ractl);
const struct file_operations afs_file_operations = {
.open = afs_open,
@@ -47,7 +47,7 @@ const struct inode_operations afs_file_inode_operations = {
const struct address_space_operations afs_fs_aops = {
.readpage = afs_readpage,
- .readpages = afs_readpages,
+ .readahead = afs_readahead,
.set_page_dirty = afs_set_page_dirty,
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
@@ -184,41 +184,50 @@ int afs_release(struct inode *inode, struct file *file)
}
/*
+ * Allocate a new read record.
+ */
+struct afs_read *afs_alloc_read(gfp_t gfp)
+{
+ struct afs_read *req;
+
+ req = kzalloc(sizeof(struct afs_read), gfp);
+ if (req)
+ refcount_set(&req->usage, 1);
+
+ return req;
+}
+
+/*
* Dispose of a ref to a read record.
*/
void afs_put_read(struct afs_read *req)
{
- int i;
-
if (refcount_dec_and_test(&req->usage)) {
- if (req->pages) {
- for (i = 0; i < req->nr_pages; i++)
- if (req->pages[i])
- put_page(req->pages[i]);
- if (req->pages != req->array)
- kfree(req->pages);
- }
+ if (req->cleanup)
+ req->cleanup(req);
+ key_put(req->key);
kfree(req);
}
}
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * deal with notification that a page was read from the cache
- */
-static void afs_file_readpage_read_complete(struct page *page,
- void *data,
- int error)
+static void afs_fetch_data_notify(struct afs_operation *op)
{
- _enter("%p,%p,%d", page, data, error);
-
- /* if the read completes with an error, we just unlock the page and let
- * the VM reissue the readpage */
- if (!error)
- SetPageUptodate(page);
- unlock_page(page);
+ struct afs_read *req = op->fetch.req;
+ struct netfs_read_subrequest *subreq = req->subreq;
+ int error = op->error;
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(op->ac.abort_code);
+ req->error = error;
+
+ if (subreq) {
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
+ req->subreq = NULL;
+ } else if (req->done) {
+ req->done(req);
+ }
}
-#endif
static void afs_fetch_data_success(struct afs_operation *op)
{
@@ -228,10 +237,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
+ afs_fetch_data_notify(op);
}
static void afs_fetch_data_put(struct afs_operation *op)
{
+ op->fetch.req->error = op->error;
afs_put_read(op->fetch.req);
}
@@ -240,13 +251,14 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
.aborted = afs_check_for_remote_deletion,
+ .failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};
/*
* Fetch file data from the volume.
*/
-int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req)
+int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
{
struct afs_operation *op;
@@ -255,11 +267,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- key_serial(key));
+ key_serial(req->key));
- op = afs_alloc_operation(key, vnode->volume);
- if (IS_ERR(op))
+ op = afs_alloc_operation(req->key, vnode->volume);
+ if (IS_ERR(op)) {
+ if (req->subreq)
+ netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
+ }
afs_op_set_vnode(op, 0, vnode);
@@ -268,336 +283,103 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re
return afs_do_sync_operation(op);
}
-/*
- * read page from file, directory or symlink, given a key to use
- */
-int afs_page_filler(void *data, struct page *page)
+static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
{
- struct inode *inode = page->mapping->host;
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_read *req;
- struct key *key = data;
- int ret;
-
- _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ struct afs_read *fsreq;
- BUG_ON(!PageLocked(page));
+ fsreq = afs_alloc_read(GFP_NOFS);
+ if (!fsreq)
+ return netfs_subreq_terminated(subreq, -ENOMEM, false);
- ret = -ESTALE;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- goto error;
+ fsreq->subreq = subreq;
+ fsreq->pos = subreq->start + subreq->transferred;
+ fsreq->len = subreq->len - subreq->transferred;
+ fsreq->key = subreq->rreq->netfs_priv;
+ fsreq->vnode = vnode;
+ fsreq->iter = &fsreq->def_iter;
- /* is it cached? */
-#ifdef CONFIG_AFS_FSCACHE
- ret = fscache_read_or_alloc_page(vnode->cache,
- page,
- afs_file_readpage_read_complete,
- NULL,
- GFP_KERNEL);
-#else
- ret = -ENOBUFS;
-#endif
- switch (ret) {
- /* read BIO submitted (page in cache) */
- case 0:
- break;
-
- /* page not yet cached */
- case -ENODATA:
- _debug("cache said ENODATA");
- goto go_on;
-
- /* page will not be cached */
- case -ENOBUFS:
- _debug("cache said ENOBUFS");
-
- fallthrough;
- default:
- go_on:
- req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
- if (!req)
- goto enomem;
-
- /* We request a full page. If the page is a partial one at the
- * end of the file, the server will return a short read and the
- * unmarshalling code will clear the unfilled space.
- */
- refcount_set(&req->usage, 1);
- req->pos = (loff_t)page->index << PAGE_SHIFT;
- req->len = PAGE_SIZE;
- req->nr_pages = 1;
- req->pages = req->array;
- req->pages[0] = page;
- get_page(page);
-
- /* read the contents of the file from the server into the
- * page */
- ret = afs_fetch_data(vnode, key, req);
- afs_put_read(req);
-
- if (ret < 0) {
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
+ iov_iter_xarray(&fsreq->def_iter, READ,
+ &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ fsreq->pos, fsreq->len);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- BUG_ON(PageFsCache(page));
-
- if (ret == -EINTR ||
- ret == -ENOMEM ||
- ret == -ERESTARTSYS ||
- ret == -EAGAIN)
- goto error;
- goto io_error;
- }
+ afs_fetch_data(fsreq->vnode, fsreq);
+}
- SetPageUptodate(page);
+static int afs_symlink_readpage(struct page *page)
+{
+ struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_read *fsreq;
+ int ret;
- /* send the page to the cache */
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, vnode->status.size,
- GFP_KERNEL) != 0) {
- fscache_uncache_page(vnode->cache, page);
- BUG_ON(PageFsCache(page));
- }
-#endif
- unlock_page(page);
- }
+ fsreq = afs_alloc_read(GFP_NOFS);
+ if (!fsreq)
+ return -ENOMEM;
- _leave(" = 0");
- return 0;
+ fsreq->pos = page->index * PAGE_SIZE;
+ fsreq->len = PAGE_SIZE;
+ fsreq->vnode = vnode;
+ fsreq->iter = &fsreq->def_iter;
+ iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
+ fsreq->pos, fsreq->len);
-io_error:
- SetPageError(page);
- goto error;
-enomem:
- ret = -ENOMEM;
-error:
- unlock_page(page);
- _leave(" = %d", ret);
+ ret = afs_fetch_data(fsreq->vnode, fsreq);
+ page_endio(page, false, ret);
return ret;
}
-/*
- * read page from file, directory or symlink, given a file to nominate the key
- * to be used
- */
-static int afs_readpage(struct file *file, struct page *page)
+static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file)
{
- struct key *key;
- int ret;
-
- if (file) {
- key = afs_file_key(file);
- ASSERT(key != NULL);
- ret = afs_page_filler(key, page);
- } else {
- struct inode *inode = page->mapping->host;
- key = afs_request_key(AFS_FS_S(inode->i_sb)->cell);
- if (IS_ERR(key)) {
- ret = PTR_ERR(key);
- } else {
- ret = afs_page_filler(key, page);
- key_put(key);
- }
- }
- return ret;
+ rreq->netfs_priv = key_get(afs_file_key(file));
}
-/*
- * Make pages available as they're filled.
- */
-static void afs_readpages_page_done(struct afs_read *req)
+static bool afs_is_cache_enabled(struct inode *inode)
{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = req->vnode;
-#endif
- struct page *page = req->pages[req->index];
+ struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode));
- req->pages[req->index] = NULL;
- SetPageUptodate(page);
-
- /* send the page to the cache */
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, vnode->status.size,
- GFP_KERNEL) != 0) {
- fscache_uncache_page(vnode->cache, page);
- BUG_ON(PageFsCache(page));
- }
-#endif
- unlock_page(page);
- put_page(page);
+ return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
}
-/*
- * Read a contiguous set of pages.
- */
-static int afs_readpages_one(struct file *file, struct address_space *mapping,
- struct list_head *pages)
+static int afs_begin_cache_operation(struct netfs_read_request *rreq)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct afs_read *req;
- struct list_head *p;
- struct page *first, *page;
- struct key *key = afs_file_key(file);
- pgoff_t index;
- int ret, n, i;
-
- /* Count the number of contiguous pages at the front of the list. Note
- * that the list goes prev-wards rather than next-wards.
- */
- first = lru_to_page(pages);
- index = first->index + 1;
- n = 1;
- for (p = first->lru.prev; p != pages; p = p->prev) {
- page = list_entry(p, struct page, lru);
- if (page->index != index)
- break;
- index++;
- n++;
- }
-
- req = kzalloc(struct_size(req, array, n), GFP_NOFS);
- if (!req)
- return -ENOMEM;
-
- refcount_set(&req->usage, 1);
- req->vnode = vnode;
- req->page_done = afs_readpages_page_done;
- req->pos = first->index;
- req->pos <<= PAGE_SHIFT;
- req->pages = req->array;
-
- /* Transfer the pages to the request. We add them in until one fails
- * to add to the LRU and then we stop (as that'll make a hole in the
- * contiguous run.
- *
- * Note that it's possible for the file size to change whilst we're
- * doing this, but we rely on the server returning less than we asked
- * for if the file shrank. We also rely on this to deal with a partial
- * page at the end of the file.
- */
- do {
- page = lru_to_page(pages);
- list_del(&page->lru);
- index = page->index;
- if (add_to_page_cache_lru(page, mapping, index,
- readahead_gfp_mask(mapping))) {
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- put_page(page);
- break;
- }
-
- req->pages[req->nr_pages++] = page;
- req->len += PAGE_SIZE;
- } while (req->nr_pages < n);
+ struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
- if (req->nr_pages == 0) {
- kfree(req);
- return 0;
- }
-
- ret = afs_fetch_data(vnode, key, req);
- if (ret < 0)
- goto error;
-
- task_io_account_read(PAGE_SIZE * req->nr_pages);
- afs_put_read(req);
- return 0;
-
-error:
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
-
- for (i = 0; i < req->nr_pages; i++) {
- page = req->pages[i];
- if (page) {
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- SetPageError(page);
- unlock_page(page);
- }
- }
-
- afs_put_read(req);
- return ret;
+ return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode));
}
-/*
- * read a set of pages
- */
-static int afs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
+ struct page *page, void **_fsdata)
{
- struct key *key = afs_file_key(file);
- struct afs_vnode *vnode;
- int ret = 0;
-
- _enter("{%d},{%lu},,%d",
- key_serial(key), mapping->host->i_ino, nr_pages);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- ASSERT(key != NULL);
+ return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
+}
- vnode = AFS_FS_I(mapping->host);
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _leave(" = -ESTALE");
- return -ESTALE;
- }
+static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+{
+ key_put(netfs_priv);
+}
- /* attempt to read as many of the pages as possible */
-#ifdef CONFIG_AFS_FSCACHE
- ret = fscache_read_or_alloc_pages(vnode->cache,
- mapping,
- pages,
- &nr_pages,
- afs_file_readpage_read_complete,
- NULL,
- mapping_gfp_mask(mapping));
-#else
- ret = -ENOBUFS;
-#endif
+const struct netfs_read_request_ops afs_req_ops = {
+ .init_rreq = afs_init_rreq,
+ .is_cache_enabled = afs_is_cache_enabled,
+ .begin_cache_operation = afs_begin_cache_operation,
+ .check_write_begin = afs_check_write_begin,
+ .issue_op = afs_req_issue_op,
+ .cleanup = afs_priv_cleanup,
+};
- switch (ret) {
- /* all pages are being read from the cache */
- case 0:
- BUG_ON(!list_empty(pages));
- BUG_ON(nr_pages != 0);
- _leave(" = 0 [reading all]");
- return 0;
-
- /* there were pages that couldn't be read from the cache */
- case -ENODATA:
- case -ENOBUFS:
- break;
-
- /* other error */
- default:
- _leave(" = %d", ret);
- return ret;
- }
+static int afs_readpage(struct file *file, struct page *page)
+{
+ if (!file)
+ return afs_symlink_readpage(page);
- while (!list_empty(pages)) {
- ret = afs_readpages_one(file, mapping, pages);
- if (ret < 0)
- break;
- }
+ return netfs_readpage(file, page, &afs_req_ops, NULL);
+}
- _leave(" = %d [netting]", ret);
- return ret;
+static void afs_readahead(struct readahead_control *ractl)
+{
+ netfs_readahead(ractl, &afs_req_ops, NULL);
}
/*
@@ -625,8 +407,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
return;
/* We may need to shorten the dirty region */
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -644,17 +426,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
- priv = afs_page_dirty(f, t);
+ priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
return;
undirty:
- trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
clear_page_dirty_for_io(page);
full_invalidate:
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
+ detach_page_private(page);
}
/*
@@ -669,20 +451,10 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
BUG_ON(!PageLocked(page));
-#ifdef CONFIG_AFS_FSCACHE
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_SIZE) {
- if (PageFsCache(page)) {
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- fscache_wait_on_page_write(vnode->cache, page);
- fscache_uncache_page(vnode->cache, page);
- }
- }
-#endif
-
if (PagePrivate(page))
afs_invalidate_dirty(page, offset, length);
+ wait_on_page_fscache(page);
_leave("");
}
@@ -693,7 +465,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- unsigned long priv;
_enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
@@ -702,16 +473,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
- if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
- _leave(" = F [cache busy]");
- return 0;
+ if (PageFsCache(page)) {
+ if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
}
#endif
if (PagePrivate(page)) {
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("rel"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
+ detach_page_private(page);
}
/* indicate that the page can be released */
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 71c58723763d..2cb0951acca6 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -198,8 +198,10 @@ void afs_wait_for_operation(struct afs_operation *op)
case -ECONNABORTED:
if (op->ops->aborted)
op->ops->aborted(op);
- break;
+ fallthrough;
default:
+ if (op->ops->failed)
+ op->ops->failed(op);
break;
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d95ed9dd86e..2f695a260442 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/circ_buf.h>
#include <linux/iversion.h>
+#include <linux/netfs.h>
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"
@@ -302,17 +303,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
- unsigned int size;
int ret;
- _enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(call->iter), req->actual_len);
+ _enter("{%u,%zu,%zu/%llu}",
+ call->unmarshall, call->iov_len, iov_iter_count(call->iter),
+ req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- req->index = 0;
- req->offset = req->pos & (PAGE_SIZE - 1);
call->unmarshall++;
if (call->operation_ID == FSFETCHDATA64) {
afs_extract_to_tmp64(call);
@@ -322,7 +321,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
}
fallthrough;
- /* extract the returned data length */
+ /* Extract the returned data length into
+ * ->actual_len. This may indicate more or less data than was
+ * requested will be returned.
+ */
case 1:
_debug("extract data length");
ret = afs_extract_data(call, true);
@@ -331,44 +333,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
- req->remain = min(req->len, req->actual_len);
- if (req->remain == 0)
+
+ if (req->actual_len == 0)
goto no_more_data;
+ call->iter = req->iter;
+ call->iov_len = min(req->actual_len, req->len);
call->unmarshall++;
-
- begin_page:
- ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - req->offset)
- size = PAGE_SIZE - req->offset;
- else
- size = req->remain;
- call->bvec[0].bv_len = size;
- call->bvec[0].bv_offset = req->offset;
- call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
- ASSERTCMP(size, <=, PAGE_SIZE);
fallthrough;
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->remain);
+ iov_iter_count(call->iter), req->actual_len);
ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->remain -= call->bvec[0].bv_len;
- req->offset += call->bvec[0].bv_len;
- ASSERTCMP(req->offset, <=, PAGE_SIZE);
- if (req->offset == PAGE_SIZE) {
- req->offset = 0;
- req->index++;
- if (req->remain > 0)
- goto begin_page;
- }
- ASSERTCMP(req->remain, ==, 0);
+ call->iter = &call->def_iter;
if (req->actual_len <= req->len)
goto no_more_data;
@@ -410,17 +393,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
break;
}
- for (; req->index < req->nr_pages; req->index++) {
- if (req->offset < PAGE_SIZE)
- zero_user_segment(req->pages[req->index],
- req->offset, PAGE_SIZE);
- req->offset = 0;
- }
-
- if (req->page_done)
- for (req->index = 0; req->index < req->nr_pages; req->index++)
- req->page_done(req);
-
_leave(" = 0 [done]");
return 0;
}
@@ -494,6 +466,8 @@ void afs_fs_fetch_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
+ req->call_debug_id = call->debug_id;
+
/* marshall the parameters */
bp = call->request;
bp[0] = htonl(FSFETCHDATA);
@@ -1079,8 +1053,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = {
/*
* store a set of pages to a very large file
*/
-static void afs_fs_store_data64(struct afs_operation *op,
- loff_t pos, loff_t size, loff_t i_size)
+static void afs_fs_store_data64(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
@@ -1095,7 +1068,7 @@ static void afs_fs_store_data64(struct afs_operation *op,
if (!call)
return afs_op_nomem(op);
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1111,47 +1084,38 @@ static void afs_fs_store_data64(struct afs_operation *op,
*bp++ = 0; /* unix mode */
*bp++ = 0; /* segment size */
- *bp++ = htonl(upper_32_bits(pos));
- *bp++ = htonl(lower_32_bits(pos));
- *bp++ = htonl(upper_32_bits(size));
- *bp++ = htonl(lower_32_bits(size));
- *bp++ = htonl(upper_32_bits(i_size));
- *bp++ = htonl(lower_32_bits(i_size));
+ *bp++ = htonl(upper_32_bits(op->store.pos));
+ *bp++ = htonl(lower_32_bits(op->store.pos));
+ *bp++ = htonl(upper_32_bits(op->store.size));
+ *bp++ = htonl(lower_32_bits(op->store.size));
+ *bp++ = htonl(upper_32_bits(op->store.i_size));
+ *bp++ = htonl(lower_32_bits(op->store.i_size));
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
/*
- * store a set of pages
+ * Write data to a file on the server.
*/
void afs_fs_store_data(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
- loff_t size, pos, i_size;
__be32 *bp;
_enter(",%x,{%llx:%llu},,",
key_serial(op->key), vp->fid.vid, vp->fid.vnode);
- size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset;
- if (op->store.first != op->store.last)
- size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT;
- pos = (loff_t)op->store.first << PAGE_SHIFT;
- pos += op->store.first_offset;
-
- i_size = i_size_read(&vp->vnode->vfs_inode);
- if (pos + size > i_size)
- i_size = size + pos;
-
_debug("size %llx, at %llx, i_size %llx",
- (unsigned long long) size, (unsigned long long) pos,
- (unsigned long long) i_size);
+ (unsigned long long)op->store.size,
+ (unsigned long long)op->store.pos,
+ (unsigned long long)op->store.i_size);
- if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) ||
- upper_32_bits(pos + size))
- return afs_fs_store_data64(op, pos, size, i_size);
+ if (upper_32_bits(op->store.pos) ||
+ upper_32_bits(op->store.size) ||
+ upper_32_bits(op->store.i_size))
+ return afs_fs_store_data64(op);
call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData,
(4 + 6 + 3) * 4,
@@ -1159,7 +1123,7 @@ void afs_fs_store_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1175,9 +1139,9 @@ void afs_fs_store_data(struct afs_operation *op)
*bp++ = 0; /* unix mode */
*bp++ = 0; /* segment size */
- *bp++ = htonl(lower_32_bits(pos));
- *bp++ = htonl(lower_32_bits(size));
- *bp++ = htonl(lower_32_bits(i_size));
+ *bp++ = htonl(lower_32_bits(op->store.pos));
+ *bp++ = htonl(lower_32_bits(op->store.size));
+ *bp++ = htonl(lower_32_bits(op->store.i_size));
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 12be88716e4c..3a129b9fd9b8 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -102,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op,
switch (status->type) {
case AFS_FTYPE_FILE:
- inode->i_mode = S_IFREG | status->mode;
+ inode->i_mode = S_IFREG | (status->mode & S_IALLUGO);
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
inode->i_mapping->a_ops = &afs_fs_aops;
break;
case AFS_FTYPE_DIR:
- inode->i_mode = S_IFDIR | status->mode;
+ inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
@@ -198,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op,
if (status->mode != vnode->status.mode) {
mode = inode->i_mode;
mode &= ~S_IALLUGO;
- mode |= status->mode;
+ mode |= status->mode & S_IALLUGO;
WRITE_ONCE(inode->i_mode, mode);
}
@@ -214,11 +214,12 @@ static void afs_apply_status(struct afs_operation *op,
if (vp->dv_before + vp->dv_delta != status->data_version) {
if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
- pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+ pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
(unsigned long long)status->data_version,
- op->type ? op->type->name : "???");
+ op->type ? op->type->name : "???",
+ op->debug_id);
vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {
@@ -427,7 +428,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
} __packed key;
struct afs_vnode_cache_aux aux;
- if (vnode->status.type == AFS_FTYPE_DIR) {
+ if (vnode->status.type != AFS_FTYPE_FILE) {
vnode->cache = NULL;
return;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1627b1872812..52157a05796a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -14,6 +14,7 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
+#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
#include <linux/backing-dev.h>
#include <linux/uuid.h>
@@ -31,6 +32,7 @@
struct pagevec;
struct afs_call;
+struct afs_vnode;
/*
* Partial file-locking emulation mode. (The problem being that AFS3 only
@@ -104,7 +106,9 @@ struct afs_call {
struct afs_server *server; /* The fileserver record if fs op (pins ref) */
struct afs_vlserver *vlserver; /* The vlserver record if vl op */
void *request; /* request data (first part) */
+ size_t iov_len; /* Size of *iter to be used */
struct iov_iter def_iter; /* Default buffer/data iterator */
+ struct iov_iter *write_iter; /* Iterator defining write to be made */
struct iov_iter *iter; /* Iterator currently in use */
union { /* Convenience for ->def_iter */
struct kvec kvec[1];
@@ -131,7 +135,6 @@ struct afs_call {
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
- bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
@@ -202,17 +205,19 @@ struct afs_read {
loff_t pos; /* Where to start reading */
loff_t len; /* How much we're asking for */
loff_t actual_len; /* How much we're actually getting */
- loff_t remain; /* Amount remaining */
loff_t file_size; /* File size returned by server */
+ struct key *key; /* The key to use to reissue the read */
+ struct afs_vnode *vnode; /* The file being read into. */
+ struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
- unsigned int index; /* Which page we're reading into */
+ unsigned int call_debug_id;
unsigned int nr_pages;
- unsigned int offset; /* offset into current page */
- struct afs_vnode *vnode;
- void (*page_done)(struct afs_read *);
- struct page **pages;
- struct page *array[];
+ int error;
+ void (*done)(struct afs_read *);
+ void (*cleanup)(struct afs_read *);
+ struct iov_iter *iter; /* Iterator representing the buffer */
+ struct iov_iter def_iter; /* Default iterator */
};
/*
@@ -739,6 +744,7 @@ struct afs_operation_ops {
void (*issue_yfs_rpc)(struct afs_operation *op);
void (*success)(struct afs_operation *op);
void (*aborted)(struct afs_operation *op);
+ void (*failed)(struct afs_operation *op);
void (*edit_dir)(struct afs_operation *op);
void (*put)(struct afs_operation *op);
};
@@ -808,12 +814,11 @@ struct afs_operation {
afs_lock_type_t type;
} lock;
struct {
- struct address_space *mapping; /* Pages being written from */
- pgoff_t first; /* first page in mapping to deal with */
- pgoff_t last; /* last page in mapping to deal with */
- unsigned first_offset; /* offset into mapping[first] */
- unsigned last_to; /* amount of mapping[last] */
- bool laundering; /* Laundering page, PG_writeback not set */
+ struct iov_iter *write_iter;
+ loff_t pos;
+ loff_t size;
+ loff_t i_size;
+ bool laundering; /* Laundering page, PG_writeback not set */
} store;
struct {
struct iattr *attr;
@@ -875,31 +880,31 @@ struct afs_vnode_cache_aux {
#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL
#endif
-static inline unsigned int afs_page_dirty_resolution(void)
+static inline unsigned int afs_page_dirty_resolution(struct page *page)
{
- int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+ int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
}
-static inline size_t afs_page_dirty_from(unsigned long priv)
+static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
{
unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
/* The lower bound is inclusive */
- return x << afs_page_dirty_resolution();
+ return x << afs_page_dirty_resolution(page);
}
-static inline size_t afs_page_dirty_to(unsigned long priv)
+static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
{
unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
/* The upper bound is immediately beyond the region */
- return (x + 1) << afs_page_dirty_resolution();
+ return (x + 1) << afs_page_dirty_resolution(page);
}
-static inline unsigned long afs_page_dirty(size_t from, size_t to)
+static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to)
{
- unsigned int res = afs_page_dirty_resolution();
+ unsigned int res = afs_page_dirty_resolution(page);
from >>= res;
to = (to - 1) >> res;
return (to << __AFS_PAGE_PRIV_SHIFT) | from;
@@ -1040,13 +1045,14 @@ extern void afs_dynroot_depopulate(struct super_block *);
extern const struct address_space_operations afs_fs_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
+extern const struct netfs_read_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
-extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
-extern int afs_page_filler(void *, struct page *);
+extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
+extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
@@ -1270,6 +1276,7 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c
static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
{
+ call->iov_len = size;
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
@@ -1277,21 +1284,25 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
static inline void afs_extract_to_tmp(struct afs_call *call)
{
+ call->iov_len = sizeof(call->tmp);
afs_extract_begin(call, &call->tmp, sizeof(call->tmp));
}
static inline void afs_extract_to_tmp64(struct afs_call *call)
{
+ call->iov_len = sizeof(call->tmp64);
afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64));
}
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
+ call->iov_len = size;
iov_iter_discard(&call->def_iter, READ, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
{
+ call->iov_len = size;
afs_extract_begin(call, call->buffer, size);
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8be709cb8542..23a1a92d64bb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -271,40 +271,6 @@ void afs_flat_call_destructor(struct afs_call *call)
call->buffer = NULL;
}
-#define AFS_BVEC_MAX 8
-
-/*
- * Load the given bvec with the next few pages.
- */
-static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
- struct bio_vec *bv, pgoff_t first, pgoff_t last,
- unsigned offset)
-{
- struct afs_operation *op = call->op;
- struct page *pages[AFS_BVEC_MAX];
- unsigned int nr, n, i, to, bytes = 0;
-
- nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX);
- n = find_get_pages_contig(op->store.mapping, first, nr, pages);
- ASSERTCMP(n, ==, nr);
-
- msg->msg_flags |= MSG_MORE;
- for (i = 0; i < nr; i++) {
- to = PAGE_SIZE;
- if (first + i >= last) {
- to = op->store.last_to;
- msg->msg_flags &= ~MSG_MORE;
- }
- bv[i].bv_page = pages[i];
- bv[i].bv_len = to - offset;
- bv[i].bv_offset = offset;
- bytes += to - offset;
- offset = 0;
- }
-
- iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
-}
-
/*
* Advance the AFS call state when the RxRPC call ends the transmit phase.
*/
@@ -318,42 +284,6 @@ static void afs_notify_end_request_tx(struct sock *sock,
}
/*
- * attach the data from a bunch of pages on an inode to a call
- */
-static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
-{
- struct afs_operation *op = call->op;
- struct bio_vec bv[AFS_BVEC_MAX];
- unsigned int bytes, nr, loop, offset;
- pgoff_t first = op->store.first, last = op->store.last;
- int ret;
-
- offset = op->store.first_offset;
- op->store.first_offset = 0;
-
- do {
- afs_load_bvec(call, msg, bv, first, last, offset);
- trace_afs_send_pages(call, msg, first, last, offset);
-
- offset = 0;
- bytes = msg->msg_iter.count;
- nr = msg->msg_iter.nr_segs;
-
- ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg,
- bytes, afs_notify_end_request_tx);
- for (loop = 0; loop < nr; loop++)
- put_page(bv[loop].bv_page);
- if (ret < 0)
- break;
-
- first += nr;
- } while (first <= last);
-
- trace_afs_sent_pages(call, op->store.first, last, first, ret);
- return ret;
-}
-
-/*
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
*/
@@ -363,6 +293,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
+ size_t len;
s64 tx_total_len;
int ret;
@@ -383,21 +314,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
* after the initial fixed part.
*/
tx_total_len = call->request_size;
- if (call->send_pages) {
- struct afs_operation *op = call->op;
-
- if (op->store.last == op->store.first) {
- tx_total_len += op->store.last_to - op->store.first_offset;
- } else {
- /* It looks mathematically like you should be able to
- * combine the following lines with the ones above, but
- * unsigned arithmetic is fun when it wraps...
- */
- tx_total_len += PAGE_SIZE - op->store.first_offset;
- tx_total_len += op->store.last_to;
- tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE;
- }
- }
+ if (call->write_iter)
+ tx_total_len += iov_iter_count(call->write_iter);
/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
@@ -439,7 +357,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
- msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
+ msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);
ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
&msg, call->request_size,
@@ -447,8 +365,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (ret < 0)
goto error_do_abort;
- if (call->send_pages) {
- ret = afs_send_pages(call, &msg);
+ if (call->write_iter) {
+ msg.msg_iter = *call->write_iter;
+ msg.msg_flags &= ~MSG_MORE;
+ trace_afs_send_data(call, &msg);
+
+ ret = rxrpc_kernel_send_data(call->net->socket,
+ call->rxcall, &msg,
+ iov_iter_count(&msg.msg_iter),
+ afs_notify_end_request_tx);
+ *call->write_iter = msg.msg_iter;
+
+ trace_afs_sent_data(call, &msg, ret);
if (ret < 0)
goto error_do_abort;
}
@@ -466,9 +394,10 @@ error_do_abort:
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, "KSD");
} else {
+ len = 0;
iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
- &msg.msg_iter, false,
+ &msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
ac->abort_code = call->abort_code;
ac->responded = true;
@@ -499,11 +428,45 @@ error_kill_call:
}
/*
+ * Log remote abort codes that indicate that we have a protocol disagreement
+ * with the server.
+ */
+static void afs_log_error(struct afs_call *call, s32 remote_abort)
+{
+ static int max = 0;
+ const char *msg;
+ int m;
+
+ switch (remote_abort) {
+ case RX_EOF: msg = "unexpected EOF"; break;
+ case RXGEN_CC_MARSHAL: msg = "client marshalling"; break;
+ case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break;
+ case RXGEN_SS_MARSHAL: msg = "server marshalling"; break;
+ case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break;
+ case RXGEN_DECODE: msg = "opcode decode"; break;
+ case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break;
+ case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break;
+ case -32: msg = "insufficient data"; break;
+ default:
+ return;
+ }
+
+ m = max;
+ if (m < 3) {
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+ &call->alist->addrs[call->addr_ix].transport);
+ }
+}
+
+/*
* deliver messages to a call
*/
static void afs_deliver_to_call(struct afs_call *call)
{
enum afs_call_state state;
+ size_t len;
u32 abort_code, remote_abort = 0;
int ret;
@@ -516,10 +479,11 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
+ len = 0;
iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
call->rxcall, &call->def_iter,
- false, &remote_abort,
+ &len, false, &remote_abort,
&call->service_id);
trace_afs_receive_data(call, &call->def_iter, false, ret);
@@ -559,6 +523,7 @@ static void afs_deliver_to_call(struct afs_call *call)
goto out;
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+ afs_log_error(call, call->abort_code);
goto done;
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
@@ -929,10 +894,11 @@ int afs_extract_data(struct afs_call *call, bool want_more)
u32 remote_abort = 0;
int ret;
- _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more);
+ _enter("{%s,%zu,%zu},%d",
+ call->type->name, call->iov_len, iov_iter_count(iter), want_more);
ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
- want_more, &remote_abort,
+ &call->iov_len, want_more, &remote_abort,
&call->service_id);
if (ret == 0 || ret == -EAGAIN)
return ret;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c9195fc67fd8..dc66ff15dd16 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -11,6 +11,8 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/netfs.h>
+#include <linux/fscache.h>
#include "internal.h"
/*
@@ -23,55 +25,6 @@ int afs_set_page_dirty(struct page *page)
}
/*
- * partly or wholly fill a page that's under preparation for writing
- */
-static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
- loff_t pos, unsigned int len, struct page *page)
-{
- struct afs_read *req;
- size_t p;
- void *data;
- int ret;
-
- _enter(",,%llu", (unsigned long long)pos);
-
- if (pos >= vnode->vfs_inode.i_size) {
- p = pos & ~PAGE_MASK;
- ASSERTCMP(p + len, <=, PAGE_SIZE);
- data = kmap(page);
- memset(data + p, 0, len);
- kunmap(page);
- return 0;
- }
-
- req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
- if (!req)
- return -ENOMEM;
-
- refcount_set(&req->usage, 1);
- req->pos = pos;
- req->len = len;
- req->nr_pages = 1;
- req->pages = req->array;
- req->pages[0] = page;
- get_page(page);
-
- ret = afs_fetch_data(vnode, key, req);
- afs_put_read(req);
- if (ret < 0) {
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- }
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
@@ -80,47 +33,40 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
- struct key *key = afs_file_key(file);
unsigned long priv;
- unsigned f, from = pos & (PAGE_SIZE - 1);
- unsigned t, to = from + len;
- pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned f, from;
+ unsigned t, to;
+ pgoff_t index;
int ret;
- _enter("{%llx:%llu},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, index, from, to);
+ _enter("{%llx:%llu},%llx,%x",
+ vnode->fid.vid, vnode->fid.vnode, pos, len);
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
+ /* Prefetch area to be written into the cache if we're caching this
+ * file. We need to do this before we get a lock on the page in case
+ * there's more than one writer competing for the same cache block.
+ */
+ ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata,
+ &afs_req_ops, NULL);
+ if (ret < 0)
+ return ret;
- if (!PageUptodate(page) && len != PAGE_SIZE) {
- ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- _leave(" = %d [prep]", ret);
- return ret;
- }
- SetPageUptodate(page);
- }
+ index = page->index;
+ from = pos - index * PAGE_SIZE;
+ to = from + len;
try_again:
/* See if this page is already partially written in a way that we can
* merge the new write with.
*/
- t = f = 0;
if (PagePrivate(page)) {
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
ASSERTCMP(f, <=, t);
- }
- if (f != t) {
if (PageWriteback(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("alrdy"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page);
goto flush_conflicting_write;
}
/* If the file is being filled locally, allow inter-write
@@ -164,12 +110,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct key *key = afs_file_key(file);
unsigned long priv;
- unsigned int f, from = pos & (PAGE_SIZE - 1);
+ unsigned int f, from = pos & (thp_size(page) - 1);
unsigned int t, to = from + copied;
loff_t i_size, maybe_i_size;
- int ret = 0;
_enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
@@ -188,88 +132,75 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_sequnlock(&vnode->cb_lock);
}
- if (!PageUptodate(page)) {
- if (copied < len) {
- /* Try and load any missing data from the server. The
- * unmarshalling routine will take care of clearing any
- * bits that are beyond the EOF.
- */
- ret = afs_fill_page(vnode, key, pos + copied,
- len - copied, page);
- if (ret < 0)
- goto out;
- }
- SetPageUptodate(page);
- }
+ ASSERT(PageUptodate(page));
if (PagePrivate(page)) {
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
if (from < f)
f = from;
if (to > t)
t = to;
- priv = afs_page_dirty(f, t);
+ priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty+"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page);
} else {
- priv = afs_page_dirty(from, to);
+ priv = afs_page_dirty(page, from, to);
attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page);
}
- set_page_dirty(page);
- if (PageDirty(page))
- _debug("dirtied");
- ret = copied;
+ if (set_page_dirty(page))
+ _debug("dirtied %lx", page->index);
out:
unlock_page(page);
put_page(page);
- return ret;
+ return copied;
}
/*
* kill all the pages in the given range
*/
static void afs_kill_pages(struct address_space *mapping,
- pgoff_t first, pgoff_t last)
+ loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct pagevec pv;
- unsigned count, loop;
+ unsigned int loop, psize;
- _enter("{%llx:%llu},%lx-%lx",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ _enter("{%llx:%llu},%llx @%llx",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
pagevec_init(&pv);
do {
- _debug("kill %lx-%lx", first, last);
+ _debug("kill %llx @%llx", len, start);
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
+ pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
+ PAGEVEC_SIZE, pv.pages);
+ if (pv.nr == 0)
+ break;
- for (loop = 0; loop < count; loop++) {
+ for (loop = 0; loop < pv.nr; loop++) {
struct page *page = pv.pages[loop];
+
+ if (page->index * PAGE_SIZE >= start + len)
+ break;
+
+ psize = thp_size(page);
+ start += psize;
+ len -= psize;
ClearPageUptodate(page);
- SetPageError(page);
end_page_writeback(page);
- if (page->index >= first)
- first = page->index + 1;
lock_page(page);
generic_error_remove_page(mapping, page);
unlock_page(page);
}
__pagevec_release(&pv);
- } while (first <= last);
+ } while (len > 0);
_leave("");
}
@@ -279,37 +210,40 @@ static void afs_kill_pages(struct address_space *mapping,
*/
static void afs_redirty_pages(struct writeback_control *wbc,
struct address_space *mapping,
- pgoff_t first, pgoff_t last)
+ loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct pagevec pv;
- unsigned count, loop;
+ unsigned int loop, psize;
- _enter("{%llx:%llu},%lx-%lx",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ _enter("{%llx:%llu},%llx @%llx",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
pagevec_init(&pv);
do {
- _debug("redirty %lx-%lx", first, last);
+ _debug("redirty %llx @%llx", len, start);
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
+ pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
+ PAGEVEC_SIZE, pv.pages);
+ if (pv.nr == 0)
+ break;
- for (loop = 0; loop < count; loop++) {
+ for (loop = 0; loop < pv.nr; loop++) {
struct page *page = pv.pages[loop];
+ if (page->index * PAGE_SIZE >= start + len)
+ break;
+
+ psize = thp_size(page);
+ start += psize;
+ len -= psize;
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
- if (page->index >= first)
- first = page->index + 1;
}
__pagevec_release(&pv);
- } while (first <= last);
+ } while (len > 0);
_leave("");
}
@@ -317,37 +251,32 @@ static void afs_redirty_pages(struct writeback_control *wbc,
/*
* completion of write to server
*/
-static void afs_pages_written_back(struct afs_vnode *vnode,
- pgoff_t first, pgoff_t last)
+static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct pagevec pv;
- unsigned long priv;
- unsigned count, loop;
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t end;
- _enter("{%llx:%llu},{%lx-%lx}",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
- pagevec_init(&pv);
+ _enter("{%llx:%llu},{%x @%llx}",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
- do {
- _debug("done %lx-%lx", first, last);
-
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
- first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
-
- for (loop = 0; loop < count; loop++) {
- priv = (unsigned long)detach_page_private(pv.pages[loop]);
- trace_afs_page_dirty(vnode, tracepoint_string("clear"),
- pv.pages[loop]->index, priv);
- end_page_writeback(pv.pages[loop]);
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, page, end) {
+ if (!PageWriteback(page)) {
+ kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end);
+ ASSERT(PageWriteback(page));
}
- first += count;
- __pagevec_release(&pv);
- } while (first <= last);
+
+ trace_afs_page_dirty(vnode, tracepoint_string("clear"), page);
+ detach_page_private(page);
+ page_endio(page, true, 0);
+ }
+
+ rcu_read_unlock();
afs_prune_wb_keys(vnode);
_leave("");
@@ -402,11 +331,9 @@ static void afs_store_data_success(struct afs_operation *op)
afs_vnode_commit_status(op, &op->file[0]);
if (op->error == 0) {
if (!op->store.laundering)
- afs_pages_written_back(vnode, op->store.first, op->store.last);
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
- atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) -
- (op->store.first * PAGE_SIZE + op->store.first_offset),
- &afs_v2net(vnode)->n_store_bytes);
+ atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes);
}
}
@@ -419,21 +346,20 @@ static const struct afs_operation_ops afs_store_data_operation = {
/*
* write to a file
*/
-static int afs_store_data(struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to, bool laundering)
+static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
+ bool laundering)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
- int ret;
+ loff_t size = iov_iter_count(iter), i_size;
+ int ret = -ENOKEY;
- _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
+ _enter("%s{%llx:%llu.%u},%llx,%llx",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- first, last, offset, to);
+ size, pos);
ret = afs_get_writeback_key(vnode, &wbk);
if (ret) {
@@ -447,13 +373,14 @@ static int afs_store_data(struct address_space *mapping,
return -ENOMEM;
}
+ i_size = i_size_read(&vnode->vfs_inode);
+
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
- op->store.mapping = mapping;
- op->store.first = first;
- op->store.last = last;
- op->store.first_offset = offset;
- op->store.last_to = to;
+ op->store.write_iter = iter;
+ op->store.pos = pos;
+ op->store.size = size;
+ op->store.i_size = max(pos + size, i_size);
op->store.laundering = laundering;
op->mtime = vnode->vfs_inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
@@ -487,73 +414,58 @@ try_next_key:
}
/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
*/
-static int afs_write_back_from_locked_page(struct address_space *mapping,
- struct writeback_control *wbc,
- struct page *primary_page,
- pgoff_t final_page)
+static void afs_extend_writeback(struct address_space *mapping,
+ struct afs_vnode *vnode,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool new_content,
+ unsigned int *_len)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct page *pages[8], *page;
- unsigned long count, priv;
- unsigned n, offset, to, f, t;
- pgoff_t start, first, last;
- loff_t i_size, end;
- int loop, ret;
-
- _enter(",%lx", primary_page->index);
+ struct pagevec pvec;
+ struct page *page;
+ unsigned long priv;
+ unsigned int psize, filler = 0;
+ unsigned int f, t;
+ loff_t len = *_len;
+ pgoff_t index = (start + len) / PAGE_SIZE;
+ bool stop = true;
+ unsigned int i;
- count = 1;
- if (test_set_page_writeback(primary_page))
- BUG();
+ XA_STATE(xas, &mapping->i_pages, index);
+ pagevec_init(&pvec);
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- start = primary_page->index;
- priv = page_private(primary_page);
- offset = afs_page_dirty_from(priv);
- to = afs_page_dirty_to(priv);
- trace_afs_page_dirty(vnode, tracepoint_string("store"),
- primary_page->index, priv);
-
- WARN_ON(offset == to);
- if (offset == to)
- trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
- primary_page->index, priv);
-
- if (start >= final_page ||
- (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)))
- goto no_more;
-
- start++;
do {
- _debug("more %lx [%lx]", start, count);
- n = final_page - start + 1;
- if (n > ARRAY_SIZE(pages))
- n = ARRAY_SIZE(pages);
- n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages);
- _debug("fgpc %u", n);
- if (n == 0)
- goto no_more;
- if (pages[0]->index != start) {
- do {
- put_page(pages[--n]);
- } while (n > 0);
- goto no_more;
- }
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
- for (loop = 0; loop < n; loop++) {
- page = pages[loop];
- if (to != PAGE_SIZE &&
- !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
break;
- if (page->index > final_page)
+ if (page->index != index)
break;
+
+ if (!page_cache_get_speculative(page)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ break;
+
if (!trylock_page(page))
break;
if (!PageDirty(page) || PageWriteback(page)) {
@@ -561,57 +473,134 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
break;
}
+ psize = thp_size(page);
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
- if (f != 0 &&
- !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
+ if (f != 0 && !new_content) {
unlock_page(page);
break;
}
- to = t;
- trace_afs_page_dirty(vnode, tracepoint_string("store+"),
- page->index, priv);
+ len += filler + t;
+ filler = psize - t;
+ if (len >= max_len || *_count <= 0)
+ stop = true;
+ else if (t == psize || new_content)
+ stop = false;
+
+ index += thp_nr_pages(page);
+ if (!pagevec_add(&pvec, page))
+ break;
+ if (stop)
+ break;
+ }
+
+ if (!stop)
+ xas_pause(&xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any pages, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!pagevec_count(&pvec))
+ break;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ page = pvec.pages[i];
+ trace_afs_page_dirty(vnode, tracepoint_string("store+"), page);
if (!clear_page_dirty_for_io(page))
BUG();
if (test_set_page_writeback(page))
BUG();
+
+ *_count -= thp_nr_pages(page);
unlock_page(page);
- put_page(page);
- }
- count += loop;
- if (loop < n) {
- for (; loop < n; loop++)
- put_page(pages[loop]);
- goto no_more;
}
- start += loop;
- } while (start <= final_page && count < 65536);
+ pagevec_release(&pvec);
+ cond_resched();
+ } while (!stop);
+
+ *_len = len;
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct page *page,
+ loff_t start, loff_t end)
+{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ unsigned long priv;
+ unsigned int offset, to, len, max_len;
+ loff_t i_size = i_size_read(&vnode->vfs_inode);
+ bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx", page->index, start, end);
+
+ if (test_set_page_writeback(page))
+ BUG();
+
+ count -= thp_nr_pages(page);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ priv = page_private(page);
+ offset = afs_page_dirty_from(page, priv);
+ to = afs_page_dirty_to(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("store"), page);
+
+ len = to - offset;
+ start += offset;
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len &&
+ (to == thp_size(page) || new_content))
+ afs_extend_writeback(mapping, vnode, &count,
+ start, max_len, new_content, &len);
+ len = min_t(loff_t, len, max_len);
+ }
-no_more:
/* We now have a contiguous set of dirty pages, each with writeback
* set; the first page is still locked at this point, but all the rest
* have been unlocked.
*/
- unlock_page(primary_page);
+ unlock_page(page);
- first = primary_page->index;
- last = first + count - 1;
+ if (start < i_size) {
+ _debug("write back %x @%llx [%llx]", len, start, i_size);
- end = (loff_t)last * PAGE_SIZE + to;
- i_size = i_size_read(&vnode->vfs_inode);
+ iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ ret = afs_store_data(vnode, &iter, start, false);
+ } else {
+ _debug("write discard %x @%llx [%llx]", len, start, i_size);
- _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
- if (end > i_size)
- to = i_size & ~PAGE_MASK;
+ /* The dirty region was entirely beyond the EOF. */
+ afs_pages_written_back(vnode, start, len);
+ ret = 0;
+ }
- ret = afs_store_data(mapping, first, last, offset, to, false);
switch (ret) {
case 0:
- ret = count;
+ wbc->nr_to_write = count;
+ ret = len;
break;
default:
@@ -623,13 +612,13 @@ no_more:
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
- afs_redirty_pages(wbc, mapping, first, last);
+ afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
case -EDQUOT:
case -ENOSPC:
- afs_redirty_pages(wbc, mapping, first, last);
+ afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, -ENOSPC);
break;
@@ -641,7 +630,7 @@ no_more:
case -ENOMEDIUM:
case -ENXIO:
trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, first, last);
+ afs_kill_pages(mapping, start, len);
mapping_set_error(mapping, ret);
break;
}
@@ -656,19 +645,19 @@ no_more:
*/
int afs_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret;
+ ssize_t ret;
+ loff_t start;
_enter("{%lx},", page->index);
+ start = page->index * PAGE_SIZE;
ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
- wbc->range_end >> PAGE_SHIFT);
+ start, LLONG_MAX - start);
if (ret < 0) {
- _leave(" = %d", ret);
- return 0;
+ _leave(" = %zd", ret);
+ return ret;
}
- wbc->nr_to_write -= ret;
-
_leave(" = 0");
return 0;
}
@@ -678,35 +667,46 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
*/
static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
- pgoff_t index, pgoff_t end, pgoff_t *_next)
+ loff_t start, loff_t end, loff_t *_next)
{
struct page *page;
- int ret, n;
+ ssize_t ret;
+ int n;
- _enter(",,%lx,%lx,", index, end);
+ _enter("%llx,%llx,", start, end);
do {
- n = find_get_pages_range_tag(mapping, &index, end,
- PAGECACHE_TAG_DIRTY, 1, &page);
+ pgoff_t index = start / PAGE_SIZE;
+
+ n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
+ PAGECACHE_TAG_DIRTY, 1, &page);
if (!n)
break;
+ start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */
+
_debug("wback %lx", page->index);
- /*
- * at this point we hold neither the i_pages lock nor the
+ /* At this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled
* back from swapper_space to tmpfs file mapping
*/
- ret = lock_page_killable(page);
- if (ret < 0) {
- put_page(page);
- _leave(" = %d", ret);
- return ret;
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = lock_page_killable(page);
+ if (ret < 0) {
+ put_page(page);
+ return ret;
+ }
+ } else {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return 0;
+ }
}
if (page->mapping != mapping || !PageDirty(page)) {
+ start += thp_size(page);
unlock_page(page);
put_page(page);
continue;
@@ -722,20 +722,20 @@ static int afs_writepages_region(struct address_space *mapping,
if (!clear_page_dirty_for_io(page))
BUG();
- ret = afs_write_back_from_locked_page(mapping, wbc, page, end);
+ ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end);
put_page(page);
if (ret < 0) {
- _leave(" = %d", ret);
+ _leave(" = %zd", ret);
return ret;
}
- wbc->nr_to_write -= ret;
+ start += ret * PAGE_SIZE;
cond_resched();
- } while (index < end && wbc->nr_to_write > 0);
+ } while (wbc->nr_to_write > 0);
- *_next = index;
- _leave(" = 0 [%lx]", *_next);
+ *_next = start;
+ _leave(" = 0 [%llx]", *_next);
return 0;
}
@@ -746,7 +746,7 @@ int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- pgoff_t start, end, next;
+ loff_t start, next;
int ret;
_enter("");
@@ -761,22 +761,19 @@ int afs_writepages(struct address_space *mapping,
return 0;
if (wbc->range_cyclic) {
- start = mapping->writeback_index;
- end = -1;
- ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
ret = afs_writepages_region(mapping, wbc, 0, start,
&next);
- mapping->writeback_index = next;
+ mapping->writeback_index = next / PAGE_SIZE;
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT);
- ret = afs_writepages_region(mapping, wbc, 0, end, &next);
+ ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
if (wbc->nr_to_write > 0)
mapping->writeback_index = next;
} else {
- start = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ ret = afs_writepages_region(mapping, wbc,
+ wbc->range_start, wbc->range_end, &next);
}
up_read(&vnode->validate_lock);
@@ -834,13 +831,13 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
+ struct page *page = thp_head(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
unsigned long priv;
- _enter("{{%llx:%llu}},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
+ _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
sb_start_pagefault(inode->i_sb);
@@ -848,30 +845,35 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* be modified. We then assume the entire page will need writing back.
*/
#ifdef CONFIG_AFS_FSCACHE
- fscache_wait_on_page_write(vnode->cache, vmf->page);
+ if (PageFsCache(page) &&
+ wait_on_page_fscache_killable(page) < 0)
+ return VM_FAULT_RETRY;
#endif
- if (PageWriteback(vmf->page) &&
- wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+ if (wait_on_page_writeback_killable(page))
return VM_FAULT_RETRY;
- if (lock_page_killable(vmf->page) < 0)
+ if (lock_page_killable(page) < 0)
return VM_FAULT_RETRY;
/* We mustn't change page->private until writeback is complete as that
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- wait_on_page_writeback(vmf->page);
+ if (wait_on_page_writeback_killable(page) < 0) {
+ unlock_page(page);
+ return VM_FAULT_RETRY;
+ }
- priv = afs_page_dirty(0, PAGE_SIZE);
+ priv = afs_page_dirty(page, 0, thp_size(page));
priv = afs_page_dirty_mmapped(priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
- vmf->page->index, priv);
- if (PagePrivate(vmf->page))
- set_page_private(vmf->page, priv);
- else
- attach_page_private(vmf->page, (void *)priv);
+ if (PagePrivate(page)) {
+ set_page_private(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page);
+ } else {
+ attach_page_private(page, (void *)priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page);
+ }
file_update_time(file);
sb_end_pagefault(inode->i_sb);
@@ -913,6 +915,8 @@ int afs_launder_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ struct bio_vec bv[1];
unsigned long priv;
unsigned int f, t;
int ret = 0;
@@ -922,26 +926,24 @@ int afs_launder_page(struct page *page)
priv = page_private(page);
if (clear_page_dirty_for_io(page)) {
f = 0;
- t = PAGE_SIZE;
+ t = thp_size(page);
if (PagePrivate(page)) {
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
}
- trace_afs_page_dirty(vnode, tracepoint_string("launder"),
- page->index, priv);
- ret = afs_store_data(mapping, page->index, page->index, t, f, true);
- }
-
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
- page->index, priv);
+ bv[0].bv_page = page;
+ bv[0].bv_offset = f;
+ bv[0].bv_len = t - f;
+ iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page)) {
- fscache_wait_on_page_write(vnode->cache, page);
- fscache_uncache_page(vnode->cache, page);
+ trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
+ ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE,
+ true);
}
-#endif
+
+ trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
+ detach_page_private(page);
+ wait_on_page_fscache(page);
return ret;
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index bd787e71a657..2b35cba8ad62 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -360,22 +360,23 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
- unsigned int size;
int ret;
- _enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(call->iter), req->actual_len);
+ _enter("{%u,%zu, %zu/%llu}",
+ call->unmarshall, call->iov_len, iov_iter_count(call->iter),
+ req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- req->index = 0;
- req->offset = req->pos & (PAGE_SIZE - 1);
afs_extract_to_tmp64(call);
call->unmarshall++;
fallthrough;
- /* extract the returned data length */
+ /* Extract the returned data length into ->actual_len. This
+ * may indicate more or less data than was requested will be
+ * returned.
+ */
case 1:
_debug("extract data length");
ret = afs_extract_data(call, true);
@@ -384,44 +385,25 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
- req->remain = min(req->len, req->actual_len);
- if (req->remain == 0)
+
+ if (req->actual_len == 0)
goto no_more_data;
+ call->iter = req->iter;
+ call->iov_len = min(req->actual_len, req->len);
call->unmarshall++;
-
- begin_page:
- ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - req->offset)
- size = PAGE_SIZE - req->offset;
- else
- size = req->remain;
- call->bvec[0].bv_len = size;
- call->bvec[0].bv_offset = req->offset;
- call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
- ASSERTCMP(size, <=, PAGE_SIZE);
fallthrough;
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->remain);
+ iov_iter_count(call->iter), req->actual_len);
ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->remain -= call->bvec[0].bv_len;
- req->offset += call->bvec[0].bv_len;
- ASSERTCMP(req->offset, <=, PAGE_SIZE);
- if (req->offset == PAGE_SIZE) {
- req->offset = 0;
- req->index++;
- if (req->remain > 0)
- goto begin_page;
- }
- ASSERTCMP(req->remain, ==, 0);
+ call->iter = &call->def_iter;
if (req->actual_len <= req->len)
goto no_more_data;
@@ -467,17 +449,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
break;
}
- for (; req->index < req->nr_pages; req->index++) {
- if (req->offset < PAGE_SIZE)
- zero_user_segment(req->pages[req->index],
- req->offset, PAGE_SIZE);
- req->offset = 0;
- }
-
- if (req->page_done)
- for (req->index = 0; req->index < req->nr_pages; req->index++)
- req->page_done(req);
-
_leave(" = 0 [done]");
return 0;
}
@@ -516,6 +487,8 @@ void yfs_fs_fetch_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
+ req->call_debug_id = call->debug_id;
+
/* marshall the parameters */
bp = call->request;
bp = xdr_encode_u32(bp, YFSFETCHDATA64);
@@ -1102,25 +1075,15 @@ void yfs_fs_store_data(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
- loff_t size, pos, i_size;
__be32 *bp;
_enter(",%x,{%llx:%llu},,",
key_serial(op->key), vp->fid.vid, vp->fid.vnode);
- size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset;
- if (op->store.first != op->store.last)
- size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT;
- pos = (loff_t)op->store.first << PAGE_SHIFT;
- pos += op->store.first_offset;
-
- i_size = i_size_read(&vp->vnode->vfs_inode);
- if (pos + size > i_size)
- i_size = size + pos;
-
_debug("size %llx, at %llx, i_size %llx",
- (unsigned long long)size, (unsigned long long)pos,
- (unsigned long long)i_size);
+ (unsigned long long)op->store.size,
+ (unsigned long long)op->store.pos,
+ (unsigned long long)op->store.i_size);
call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64,
sizeof(__be32) +
@@ -1133,8 +1096,7 @@ void yfs_fs_store_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
- call->key = op->key;
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1142,9 +1104,9 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid);
bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime);
- bp = xdr_encode_u64(bp, pos);
- bp = xdr_encode_u64(bp, size);
- bp = xdr_encode_u64(bp, i_size);
+ bp = xdr_encode_u64(bp, op->store.pos);
+ bp = xdr_encode_u64(bp, op->store.size);
+ bp = xdr_encode_u64(bp, op->store.i_size);
yfs_check_req(call, bp);
trace_afs_make_fs_call(call, &vp->fid);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..187b3f2b9202 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Align to page */
- if (!dump_skip(cprm, dataoff - cprm->pos))
- goto end_coredump;
+ dump_skip_to(cprm, dataoff);
for (i = 0; i < vma_count; i++) {
struct core_vma_metadata *meta = vma_meta + i;
@@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
}
- dump_truncate(cprm);
if (!elf_core_write_extra_data(cprm))
goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3cfd6cd46f26..2c99b102c860 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
- if (!dump_skip(cprm, dataoff - cprm->pos))
- goto end_coredump;
+ dump_skip_to(cprm, dataoff);
if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
goto end_coredump;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 363015fcffdb..2ef24273069d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb);
@@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
@@ -1240,8 +1244,6 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
lockdep_assert_held(&bdev->bd_mutex);
- clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
-
rescan:
if (bdev->bd_part_count)
return -EBUSY;
@@ -1249,6 +1251,8 @@ rescan:
invalidate_bdev(bdev);
blk_drop_partitions(disk);
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
/*
* Historically we only set the capacity to zero for devices that
* support partitions (independ of actually having partitions created).
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b634c42115ea..cec88a66bd6c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute
subdir-ccflags-y += -Wmissing-prototypes
subdir-ccflags-y += -Wold-style-definition
subdir-ccflags-y += -Wmissing-include-dirs
-subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
-subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
-subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
-subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+condflags := \
+ $(call cc-option, -Wunused-but-set-variable) \
+ $(call cc-option, -Wunused-const-variable) \
+ $(call cc-option, -Wpacked-not-aligned) \
+ $(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
# The following turn off the warnings enabled by -Wextra
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
@@ -28,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o
+ subpage.o tree-mod-log.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f47c1528eb9a..117d423fdb93 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -14,6 +14,7 @@
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
+#include "tree-mod-log.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == SEQ_LAST)
+ else if (time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
*
@@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1217,9 +1218,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != SEQ_LAST) {
+ time_seq != BTRFS_SEQ_LAST) {
#else
- if (trans && time_seq != SEQ_LAST) {
+ if (trans && time_seq != BTRFS_SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
.root_objectid = root->root_key.objectid,
@@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
@@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
}
if (trans)
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_get_tree_mod_seq(fs_info, &seq_elem);
else
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- tree_mod_seq_elem.seq, &refs,
+ seq_elem.seq, &refs,
&extent_item_pos, ignore_offset);
if (ret)
goto out;
@@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots,
+ seq_elem.seq, &roots,
ignore_offset);
if (ret)
break;
@@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_put_tree_mod_seq(fs_info, &seq_elem);
btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 744b99ddc28c..aa57bdc8fc89 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
*/
- if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
return;
spin_lock(&fs_info->unused_bgs_lock);
@@ -1462,12 +1462,12 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ return;
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&bg->lock);
+
+ /* Get out fast, in case we're unmounting the filesystem */
+ if (btrfs_fs_closing(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+ bg->start, div_u64(bg->used * 100, bg->length));
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start);
+ if (ret)
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+ btrfs_put_block_group(bg);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_reclaim_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
@@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
+ bool dirty_bg_running;
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ do {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- /*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
- */
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
+ dirty_bg_running = false;
- mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
- }
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
if (do_chunk_alloc) {
/*
@@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
+ u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
+
+ /*
+ * If there's not available space for the chunk tree (system
+ * space) and there are other tasks that reserved space for
+ * creating a new system block group, wait for them to complete
+ * the creation of their system block group and release excess
+ * reserved space. We do this because:
+ *
+ * *) We can end up allocating more system chunks than necessary
+ * when there are multiple tasks that are concurrently
+ * allocating block groups, which can lead to exhaustion of
+ * the system array in the superblock;
+ *
+ * *) If we allocate extra and unnecessary system block groups,
+ * despite being empty for a long time, and possibly forever,
+ * they end not being added to the list of unused block groups
+ * because that typically happens only when deallocating the
+ * last extent from a block group - which never happens since
+ * we never allocate from them in the first place. The few
+ * exceptions are when mounting a filesystem or running scrub,
+ * which add unused block groups to the list of unused block
+ * groups, to be deleted by the cleaner kthread.
+ * And even when they are added to the list of unused block
+ * groups, it can take a long time until they get deleted,
+ * since the cleaner kthread might be sleeping or busy with
+ * other work (deleting subvolumes, running delayed iputs,
+ * defrag scheduling, etc);
+ *
+ * This is rare in practice, but can happen when too many tasks
+ * are allocating blocks groups in parallel (via fallocate())
+ * and before the one that reserved space for a new system block
+ * group finishes the block group creation and releases the space
+ * reserved in excess (at btrfs_create_pending_block_groups()),
+ * other tasks end up here and see free system space temporarily
+ * not enough for updating the chunk tree.
+ *
+ * We unlock the chunk mutex before waiting for such tasks and
+ * lock it again after the wait, otherwise we would deadlock.
+ * It is safe to do so because allocating a system chunk is the
+ * first thing done while allocating a new block group.
+ */
+ if (reserved > trans->chunk_bytes_reserved) {
+ const u64 min_needed = reserved - thresh;
+
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_event(cur_trans->chunk_reserve_wait,
+ atomic64_read(&cur_trans->chunk_bytes_reserved) <=
+ min_needed);
+ mutex_lock(&fs_info->chunk_mutex);
+ goto again;
+ }
/*
* Ignore failure to create system chunk. We might end up not
@@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
+ if (!ret) {
+ atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved += thresh;
+ }
}
}
@@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ecc3372a5ce..7b927425dc71 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 28e202e89660..c652e19ad74e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -220,6 +220,7 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
+ struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
@@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
-static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
- int ret = 0;
+ bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit) {
- /*
- * After a ranged fsync we might have left some extent maps
- * (that fall outside the fsync's range). So return false
- * here if the list isn't empty, to make sure btrfs_log_inode()
- * will be called and process those extent maps.
- */
- smp_mb();
- if (list_empty(&inode->extent_tree.modified_extents))
- ret = 1;
- }
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
spin_unlock(&inode->lock);
return ret;
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 113cb85c1fd4..169508609324 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->pagev);
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
+ /* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
- kunmap(block_ctx->pagev[num_pages]);
+ kunmap_local(block_ctx->datav[num_pages]);
block_ctx->datav[num_pages] = NULL;
}
if (block_ctx->pagev[num_pages]) {
@@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i = 0;
+ int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
struct bio_vec bvec;
@@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec.bv_page);
+ mapped_datav[i] = kmap_local_page(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
- bio_for_each_segment(bvec, bio, iter)
- kunmap(bvec.bv_page);
+ /* Unmap in reverse order */
+ for (--i; i >= 0; i--)
+ kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3f4c832abfed..17f93fd28f7e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws,
case BTRFS_COMPRESS_NONE:
default:
/*
- * This can't happen, the type is validated several times
- * before we get here. As a sane fallback, return what the
- * callers will understand as 'no compression happened'.
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
*/
+ *out_pages = 0;
return -E2BIG;
}
}
@@ -1611,7 +1616,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
curr_sample_pos = 0;
while (index < index_end) {
page = find_get_page(inode->i_mapping, index);
- in_data = kmap(page);
+ in_data = kmap_local_page(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
@@ -1624,7 +1629,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
- kunmap(page);
+ kunmap_local(in_data);
put_page(page);
index++;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 34b929bd5c1a..a484fb72a01f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -14,6 +14,7 @@
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
+#include "tree-mod-log.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -233,597 +234,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
-enum mod_log_op {
- MOD_LOG_KEY_REPLACE,
- MOD_LOG_KEY_ADD,
- MOD_LOG_KEY_REMOVE,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING,
- MOD_LOG_MOVE_KEYS,
- MOD_LOG_ROOT_REPLACE,
-};
-
-struct tree_mod_root {
- u64 logical;
- u8 level;
-};
-
-struct tree_mod_elem {
- struct rb_node node;
- u64 logical;
- u64 seq;
- enum mod_log_op op;
-
- /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
- int slot;
-
- /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
- u64 generation;
-
- /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* this is used for op == MOD_LOG_MOVE_KEYS */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* this is used for op == MOD_LOG_ROOT_REPLACE */
- struct tree_mod_root old_root;
-};
-
-/*
- * Pull a new tree mod seq number for our operation.
- */
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
- return atomic64_inc_return(&fs_info->tree_mod_seq);
-}
-
-/*
- * This adds a new blocker to the tree mod log's blocker list if the @elem
- * passed does not already have a sequence number set. So when a caller expects
- * to record tree modifications, it should ensure to set elem->seq to zero
- * before calling btrfs_get_tree_mod_seq.
- * Returns a fresh, unused tree log modification sequence number, even if no new
- * blocker was added.
- */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- write_lock(&fs_info->tree_mod_log_lock);
- if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq(fs_info);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-
- return elem->seq;
-}
-
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct rb_node *next;
- struct tree_mod_elem *tm;
- u64 min_seq = (u64)-1;
- u64 seq_putting = elem->seq;
-
- if (!seq_putting)
- return;
-
- write_lock(&fs_info->tree_mod_log_lock);
- list_del(&elem->list);
- elem->seq = 0;
-
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *first;
-
- first = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- if (seq_putting > first->seq) {
- /*
- * Blocker with lower sequence number exists, we
- * cannot remove anything from the log.
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = first->seq;
- }
-
- /*
- * anything that's lower than the lowest existing (read: blocked)
- * sequence number can be removed from the tree.
- */
- tm_root = &fs_info->tree_mod_log;
- for (node = rb_first(tm_root); node; node = next) {
- next = rb_next(node);
- tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq >= min_seq)
- continue;
- rb_erase(node, tm_root);
- kfree(tm);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
-/*
- * key order of the log:
- * node/leaf start address -> sequence
- *
- * The 'start address' is the logical address of the *new* root node
- * for root replace operations, or the logical address of the affected
- * block for all other operations.
- */
-static noinline int
-__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
-{
- struct rb_root *tm_root;
- struct rb_node **new;
- struct rb_node *parent = NULL;
- struct tree_mod_elem *cur;
-
- lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
-
- tm->seq = btrfs_inc_tree_mod_seq(fs_info);
-
- tm_root = &fs_info->tree_mod_log;
- new = &tm_root->rb_node;
- while (*new) {
- cur = rb_entry(*new, struct tree_mod_elem, node);
- parent = *new;
- if (cur->logical < tm->logical)
- new = &((*new)->rb_left);
- else if (cur->logical > tm->logical)
- new = &((*new)->rb_right);
- else if (cur->seq < tm->seq)
- new = &((*new)->rb_left);
- else if (cur->seq > tm->seq)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
-
- rb_link_node(&tm->node, parent, new);
- rb_insert_color(&tm->node, tm_root);
- return 0;
-}
-
-/*
- * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
- * returns zero with the tree_mod_log_lock acquired. The caller must hold
- * this until all tree mod log insertions are recorded in the rb tree and then
- * write unlock fs_info::tree_mod_log_lock.
- */
-static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb) {
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 1;
- if (eb && btrfs_header_level(eb) == 0)
- return 1;
-
- write_lock(&fs_info->tree_mod_log_lock);
- if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- write_unlock(&fs_info->tree_mod_log_lock);
- return 1;
- }
-
- return 0;
-}
-
-/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
-{
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 0;
- if (eb && btrfs_header_level(eb) == 0)
- return 0;
-
- return 1;
-}
-
-static struct tree_mod_elem *
-alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
-
- tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return NULL;
-
- tm->logical = eb->start;
- if (op != MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
- tm->op = op;
- tm->slot = slot;
- tm->generation = btrfs_node_ptr_generation(eb, slot);
- RB_CLEAR_NODE(&tm->node);
-
- return tm;
-}
-
-static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
- int ret;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
- if (!tm)
- return -ENOMEM;
-
- if (tree_mod_dont_log(eb->fs_info, eb)) {
- kfree(tm);
- return 0;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- kfree(tm);
-
- return ret;
-}
-
-static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
- int dst_slot, int src_slot, int nr_items)
-{
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int ret = 0;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = eb->start;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = MOD_LOG_MOVE_KEYS;
-
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
- locked = 1;
-
- /*
- * When we override something during the move, we log these removals.
- * This can only happen when we move towards the beginning of the
- * buffer, i.e. dst_slot < src_slot.
- */
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
- if (ret)
- goto free_tms;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- if (ret)
- goto free_tms;
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-free_tms:
- for (i = 0; i < nr_items; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
- kfree(tm);
-
- return ret;
-}
-
-static inline int
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
-{
- int i, j;
- int ret;
-
- for (i = nritems - 1; i >= 0; i--) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
- if (ret) {
- for (j = nritems - 1; j > i; j--)
- rb_erase(&tm_list[j]->node,
- &fs_info->tree_mod_log);
- return ret;
- }
- }
-
- return 0;
-}
-
-static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
- struct extent_buffer *new_root, int log_removal)
-{
- struct btrfs_fs_info *fs_info = old_root->fs_info;
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int ret = 0;
- int i;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (log_removal && btrfs_header_level(old_root) > 0) {
- nritems = btrfs_header_nritems(old_root);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list) {
- ret = -ENOMEM;
- goto free_tms;
- }
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
- }
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = new_root->start;
- tm->old_root.logical = old_root->start;
- tm->old_root.level = btrfs_header_level(old_root);
- tm->generation = btrfs_header_generation(old_root);
- tm->op = MOD_LOG_ROOT_REPLACE;
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
-
- if (tm_list)
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- if (!ret)
- ret = __tree_mod_log_insert(fs_info, tm);
-
- write_unlock(&fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return ret;
-
-free_tms:
- if (tm_list) {
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
- }
- kfree(tm);
-
- return ret;
-}
-
-static struct tree_mod_elem *
-__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
- int smallest)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct tree_mod_elem *cur = NULL;
- struct tree_mod_elem *found = NULL;
-
- read_lock(&fs_info->tree_mod_log_lock);
- tm_root = &fs_info->tree_mod_log;
- node = tm_root->rb_node;
- while (node) {
- cur = rb_entry(node, struct tree_mod_elem, node);
- if (cur->logical < start) {
- node = node->rb_left;
- } else if (cur->logical > start) {
- node = node->rb_right;
- } else if (cur->seq < min_seq) {
- node = node->rb_left;
- } else if (!smallest) {
- /* we want the node with the highest seq */
- if (found)
- BUG_ON(found->seq > cur->seq);
- found = cur;
- node = node->rb_left;
- } else if (cur->seq > min_seq) {
- /* we want the node with the smallest seq */
- if (found)
- BUG_ON(found->seq < cur->seq);
- found = cur;
- node = node->rb_right;
- } else {
- found = cur;
- break;
- }
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
- return found;
-}
-
-/*
- * this returns the element from the log with the smallest time sequence
- * value that's in the log (the oldest log item). any element with a time
- * sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
- u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 1);
-}
-
-/*
- * this returns the element from the log with the largest time sequence
- * value that's in the log (the most recent log item). any element with
- * a time sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 0);
-}
-
-static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src, unsigned long dst_offset,
- unsigned long src_offset, int nr_items)
-{
- struct btrfs_fs_info *fs_info = dst->fs_info;
- int ret = 0;
- struct tree_mod_elem **tm_list = NULL;
- struct tree_mod_elem **tm_list_add, **tm_list_rem;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
- return 0;
-
- tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm_list_add = tm_list;
- tm_list_rem = tm_list + nr_items;
- for (i = 0; i < nr_items; i++) {
- tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
- if (!tm_list_rem[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- MOD_LOG_KEY_ADD, GFP_NOFS);
- if (!tm_list_add[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
- locked = 1;
-
- for (i = 0; i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
- if (ret)
- goto free_tms;
- ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
- if (ret)
- goto free_tms;
- }
-
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nr_items * 2; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return ret;
-}
-
-static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
-{
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int i;
- int ret = 0;
-
- if (btrfs_header_level(eb) == 0)
- return 0;
-
- if (!tree_mod_need_log(eb->fs_info, NULL))
- return 0;
-
- nritems = btrfs_header_nritems(eb);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
-
- ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
-
- return ret;
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -1090,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
atomic_inc(&cow->refs);
- ret = tree_mod_log_insert_root(root->node, cow, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1100,15 +510,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(parent, parent_slot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(buf);
+ ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
@@ -1127,298 +537,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * returns the logical address of the oldest predecessor of the given root.
- * entries older than time_seq are ignored.
- */
-static struct tree_mod_elem *__tree_mod_log_oldest_root(
- struct extent_buffer *eb_root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- struct tree_mod_elem *found = NULL;
- u64 root_logical = eb_root->start;
- int looped = 0;
-
- if (!time_seq)
- return NULL;
-
- /*
- * the very last operation that's logged for a root is the
- * replacement operation (if it is replaced at all). this has
- * the logical address of the *new* root, making it the very
- * first operation that's logged for this root.
- */
- while (1) {
- tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
- time_seq);
- if (!looped && !tm)
- return NULL;
- /*
- * if there are no tree operation for the oldest root, we simply
- * return it. this should only happen if that (old) root is at
- * level 0.
- */
- if (!tm)
- break;
-
- /*
- * if there's an operation that's not a root replacement, we
- * found the oldest version of our root. normally, we'll find a
- * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
- */
- if (tm->op != MOD_LOG_ROOT_REPLACE)
- break;
-
- found = tm;
- root_logical = tm->old_root.logical;
- looped = 1;
- }
-
- /* if there's no old root to return, return what we found instead */
- if (!found)
- found = tm;
-
- return found;
-}
-
-/*
- * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewound (until we reach something older than
- * time_seq).
- */
-static void
-__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
- u64 time_seq, struct tree_mod_elem *first_tm)
-{
- u32 n;
- struct rb_node *next;
- struct tree_mod_elem *tm = first_tm;
- unsigned long o_dst;
- unsigned long o_src;
- unsigned long p_size = sizeof(struct btrfs_key_ptr);
-
- n = btrfs_header_nritems(eb);
- read_lock(&fs_info->tree_mod_log_lock);
- while (tm && tm->seq >= time_seq) {
- /*
- * all the operations are recorded with the operator used for
- * the modification. as we're going backwards, we do the
- * opposite of each operation here.
- */
- switch (tm->op) {
- case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
- BUG_ON(tm->slot < n);
- fallthrough;
- case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
- case MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- n++;
- break;
- case MOD_LOG_KEY_REPLACE:
- BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- break;
- case MOD_LOG_KEY_ADD:
- /* if a move operation is needed it's in the log */
- n--;
- break;
- case MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
- memmove_extent_buffer(eb, o_dst, o_src,
- tm->move.nr_items * p_size);
- break;
- case MOD_LOG_ROOT_REPLACE:
- /*
- * this operation is special. for roots, this must be
- * handled explicitly before rewinding.
- * for non-roots, this operation may exist if the node
- * was a root: root A -> child B; then A gets empty and
- * B is promoted to the new root. in the mod log, we'll
- * have a root-replace operation for B, a tree block
- * that is no root. we simply ignore that operation.
- */
- break;
- }
- next = rb_next(&tm->node);
- if (!next)
- break;
- tm = rb_entry(next, struct tree_mod_elem, node);
- if (tm->logical != first_tm->logical)
- break;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
- btrfs_set_header_nritems(eb, n);
-}
-
-/*
- * Called with eb read locked. If the buffer cannot be rewound, the same buffer
- * is returned. If rewind operations happen, a fresh buffer is returned. The
- * returned buffer is always read-locked. If the returned buffer is not the
- * input buffer, the lock on the input buffer is released and the input buffer
- * is freed (its refcount is decremented).
- */
-static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct extent_buffer *eb, u64 time_seq)
-{
- struct extent_buffer *eb_rewin;
- struct tree_mod_elem *tm;
-
- if (!time_seq)
- return eb;
-
- if (btrfs_header_level(eb) == 0)
- return eb;
-
- tm = tree_mod_log_search(fs_info, eb->start, time_seq);
- if (!tm)
- return eb;
-
- if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
- if (!eb_rewin) {
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- btrfs_set_header_bytenr(eb_rewin, eb->start);
- btrfs_set_header_backref_rev(eb_rewin,
- btrfs_header_backref_rev(eb));
- btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
- btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
- } else {
- eb_rewin = btrfs_clone_extent_buffer(eb);
- if (!eb_rewin) {
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- }
-
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
-
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
- eb_rewin, btrfs_header_level(eb_rewin));
- btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
- WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb_rewin;
-}
-
-/*
- * get_old_root() rewinds the state of @root's root node to the given @time_seq
- * value. If there are no changes, the current root->root_node is returned. If
- * anything changed in between, there's a fresh buffer allocated on which the
- * rewind operations are done. In any case, the returned buffer is read locked.
- * Returns NULL on error (with no locks held).
- */
-static inline struct extent_buffer *
-get_old_root(struct btrfs_root *root, u64 time_seq)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct tree_mod_elem *tm;
- struct extent_buffer *eb = NULL;
- struct extent_buffer *eb_root;
- u64 eb_root_owner = 0;
- struct extent_buffer *old;
- struct tree_mod_root *old_root = NULL;
- u64 old_generation = 0;
- u64 logical;
- int level;
-
- eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (!tm)
- return eb_root;
-
- if (tm->op == MOD_LOG_ROOT_REPLACE) {
- old_root = &tm->old_root;
- old_generation = tm->generation;
- logical = old_root->logical;
- level = old_root->level;
- } else {
- logical = eb_root->start;
- level = btrfs_header_level(eb_root);
- }
-
- tm = tree_mod_log_search(fs_info, logical, time_seq);
- if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, root->root_key.objectid,
- 0, level, NULL);
- if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
- if (!IS_ERR(old))
- free_extent_buffer(old);
- btrfs_warn(fs_info,
- "failed to read tree block %llu from get_old_root",
- logical);
- } else {
- btrfs_tree_read_lock(old);
- eb = btrfs_clone_extent_buffer(old);
- btrfs_tree_read_unlock(old);
- free_extent_buffer(old);
- }
- } else if (old_root) {
- eb_root_owner = btrfs_header_owner(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(fs_info, logical);
- } else {
- eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- }
-
- if (!eb)
- return NULL;
- if (old_root) {
- btrfs_set_header_bytenr(eb, eb->start);
- btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, eb_root_owner);
- btrfs_set_header_level(eb, old_root->level);
- btrfs_set_header_generation(eb, old_generation);
- }
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
- btrfs_header_level(eb));
- btrfs_tree_read_lock(eb);
- if (tm)
- __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
- else
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb;
-}
-
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- int level;
- struct extent_buffer *eb_root = btrfs_root_node(root);
-
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
- level = tm->old_root.level;
- } else {
- level = btrfs_header_level(eb_root);
- }
- free_extent_buffer(eb_root);
-
- return level;
-}
-
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1840,7 +958,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- ret = tree_mod_log_insert_root(root->node, child, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
@@ -1920,8 +1038,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1966,8 +1084,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2068,8 +1186,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2122,8 +1240,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2161,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 search;
u64 target;
u64 nread = 0;
+ u64 nread_max;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
u32 nscan = 0;
- if (level != 1)
+ if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
return;
if (!path->nodes[level])
@@ -2174,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
node = path->nodes[level];
+ /*
+ * Since the time between visiting leaves is much shorter than the time
+ * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+ * much IO at once (possibly random).
+ */
+ if (path->reada == READA_FORWARD_ALWAYS) {
+ if (level > 1)
+ nread_max = node->fs_info->nodesize;
+ else
+ nread_max = SZ_128K;
+ } else {
+ nread_max = SZ_64K;
+ }
+
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
eb = find_extent_buffer(fs_info, search);
@@ -2192,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
if (nr == 0)
break;
nr--;
- } else if (path->reada == READA_FORWARD) {
+ } else if (path->reada == READA_FORWARD ||
+ path->reada == READA_FORWARD_ALWAYS) {
nr++;
if (nr >= nritems)
break;
@@ -2203,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
break;
}
search = btrfs_node_blockptr(node, nr);
- if ((search <= target && target - search <= 65536) ||
+ if (path->reada == READA_FORWARD_ALWAYS ||
+ (search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
- if ((nread > 65536 || nscan > 32))
+ if (nread > nread_max || nscan > 32)
break;
}
}
@@ -2318,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
+ if (p->reada == READA_FORWARD_ALWAYS)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
@@ -2861,7 +1999,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
}
again:
- b = get_old_root(root, time_seq);
+ b = btrfs_get_old_root(root, time_seq);
if (!b) {
ret = -EIO;
goto done;
@@ -2916,7 +2054,7 @@ again:
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -3030,8 +2168,8 @@ static void fixup_low_keys(struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
- GFP_ATOMIC);
+ ret = btrfs_tree_mod_log_insert_key(t, tslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -3194,7 +2332,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3206,8 +2344,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * Don't call tree_mod_log_insert_move here, key removal was
- * already fully logged by tree_mod_log_eb_copy above.
+ * Don't call btrfs_tree_mod_log_insert_move() here, key removal
+ * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3268,15 +2406,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
- push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3342,7 +2480,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- ret = tree_mod_log_insert_root(root->node, c, 0);
+ ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
@@ -3381,8 +2519,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
if (slot != nritems) {
if (level) {
- ret = tree_mod_log_insert_move(lower, slot + 1, slot,
- nritems - slot);
+ ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
+ slot, nritems - slot);
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
@@ -3391,8 +2529,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(lower, slot,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -3433,9 +2571,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
* tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
- * elements below with tree_mod_log_eb_copy. We're holding a
- * tree lock on the buffer, which is why we cannot race with
- * other tree_mod_log users.
+ * elements below with btrfs_tree_mod_log_eb_copy(). We're
+ * holding a tree lock on the buffer, which is why we cannot
+ * race with other tree_mod_log users.
*/
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
@@ -3462,7 +2600,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
- ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
+ ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -4844,8 +3982,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level) {
- ret = tree_mod_log_insert_move(parent, slot, slot + 1,
- nritems - slot - 1);
+ ret = btrfs_tree_mod_log_insert_move(parent, slot,
+ slot + 1, nritems - slot - 1);
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
@@ -4854,8 +3992,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, slot,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ae776ab3967..f83fd3cbf243 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -342,6 +342,27 @@ struct btrfs_node {
struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));
+/* Read ahead values for struct btrfs_path.reada */
+enum {
+ READA_NONE,
+ READA_BACK,
+ READA_FORWARD,
+ /*
+ * Similar to READA_FORWARD but unlike it:
+ *
+ * 1) It will trigger readahead even for leaves that are not close to
+ * each other on disk;
+ * 2) It also triggers readahead for nodes;
+ * 3) During a search, even when a node or leaf is already in memory, it
+ * will still trigger readahead for other nodes and leaves that follow
+ * it.
+ *
+ * This is meant to be used only when we know we are iterating over the
+ * entire tree or a very large part of it.
+ */
+ READA_FORWARD_ALWAYS,
+};
+
/*
* btrfs_paths remember the path taken from the root down to the leaf.
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
@@ -350,7 +371,6 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
-enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@@ -482,16 +502,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-/* delayed seq elem */
-struct seq_list {
- struct list_head list;
- u64 seq;
-};
-
-#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
-
-#define SEQ_LAST ((u64)-1)
-
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -572,6 +582,15 @@ enum {
/* Indicate that we can't trust the free space tree for caching yet */
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
};
/*
@@ -941,10 +960,16 @@ struct btrfs_fs_info {
struct work_struct async_data_reclaim_work;
struct work_struct preempt_reclaim_work;
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
+
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
- struct mutex delete_unused_bgs_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
/* Cached block sizes */
u32 nodesize;
@@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
/*
@@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
-/* tree mod log functions from ctree.c */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
-
/* root-item.c */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence, const char *name,
@@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end, int mirror);
+ struct page *page, u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
#define BTRFS_ILOCK_TRY (1U << 1)
+#define BTRFS_ILOCK_MMAP (1U << 2)
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
@@ -3189,6 +3207,9 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
@@ -3217,8 +3238,9 @@ extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_drop_extents_args *args);
-int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
@@ -3405,6 +3427,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#define ASSERT(expr) (void)(expr)
#endif
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -3732,8 +3767,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
-
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf25401c9768..1a88f6214ebc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
- /*
- * Since we're under a transaction reserve_metadata_bytes could
- * try to commit the transaction which will make it return
- * EAGAIN to make us stop the transaction we have, so return
- * ENOSPC instead so that btrfs_dirty_inode knows what to do.
- */
- if (ret == -EAGAIN) {
- ret = -ENOSPC;
- btrfs_qgroup_free_meta_prealloc(root, num_bytes);
- }
- if (!ret) {
- node->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(fs_info,
- "delayed_inode",
- btrfs_ino(inode),
- num_bytes, 1);
- } else {
+ /* NO_FLUSH could only fail with -ENOSPC */
+ ASSERT(ret == 0 || ret == -ENOSPC);
+ if (ret)
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
- }
- return ret;
+ } else {
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
- btrfs_ino(inode), num_bytes, 1);
+ node->inode_id, num_bytes, 1);
node->bytes_reserved = num_bytes;
}
@@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- inode_unlock_shared(inode);
- inode_lock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
- delayed_node);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
if (ret)
goto release_node;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 63be7d01a9a3..c92d9d4f5f46 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -11,6 +11,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
+#include "tree-mod-log.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
+ seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
@@ -517,23 +509,16 @@ again:
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
- struct seq_list *elem;
int ret = 0;
+ u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- if (seq >= elem->seq) {
- btrfs_debug(fs_info,
- "holding back delayed_ref %#x.%x, lowest is %#x.%x",
- (u32)(seq >> 32), (u32)seq,
- (u32)(elem->seq >> 32), (u32)elem->seq);
- ret = 1;
- }
+ if (min_seq != 0 && seq >= min_seq) {
+ btrfs_debug(fs_info,
+ "holding back delayed_ref %llu, lowest is %llu",
+ seq, min_seq);
+ ret = 1;
}
- read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3a9c1e046ebe..d05f73530af7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
+ if (!dev_root)
+ return 0;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b718cfea40..c9a3036c23bf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
return ret;
}
+static int csum_one_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 result[BTRFS_CSUM_SIZE];
+ int ret;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
+ csum_tree_block(eb, result);
+
+ if (btrfs_header_level(eb))
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+
+ if (ret < 0) {
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info,
+ "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
+ }
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
+
+ return 0;
+}
+
+/* Checksum all dirty extent buffers in one bio_vec */
+static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
+ struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 cur;
+ int ret = 0;
+
+ for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
+ cur += fs_info->nodesize) {
+ struct extent_buffer *eb;
+ bool uptodate;
+
+ eb = find_extent_buffer(fs_info, cur);
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
+ fs_info->nodesize);
+
+ /* A dirty eb shouldn't disappear from buffer_radix */
+ if (WARN_ON(!eb))
+ return -EUCLEAN;
+
+ if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ if (WARN_ON(!uptodate)) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+
+ ret = csum_one_extent_buffer(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
/*
* Checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block.
@@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
- u8 result[BTRFS_CSUM_SIZE];
struct extent_buffer *eb;
- int ret;
+
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
@@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
- ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- offsetof(struct btrfs_header, fsid),
- BTRFS_FSID_SIZE) == 0);
-
- csum_tree_block(eb, result);
-
- if (btrfs_header_level(eb))
- ret = btrfs_check_node(eb);
- else
- ret = btrfs_check_leaf_full(eb);
-
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
- btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
- }
- write_extent_buffer(eb, result, 0, fs_info->csum_size);
-
- return 0;
+ return csum_one_extent_buffer(eb);
}
static int check_tree_block_fsid(struct extent_buffer *eb)
@@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
struct extent_buffer *eb;
+ int cur_bit = 0;
+ u64 page_start = page_offset(page);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+ }
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ ASSERT(subpage->dirty_bitmap);
+ while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ unsigned long flags;
+ u64 cur;
+ u16 tmp = (1 << cur_bit);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!(tmp & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur_bit++;
+ continue;
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur = page_start + cur_bit * fs_info->sectorsize;
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
- BUG_ON(!eb);
- BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ eb = find_extent_buffer(fs_info, cur);
+ ASSERT(eb);
+ ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ ASSERT(atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ free_extent_buffer(eb);
+
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ }
#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * Reclaim block groups in the reclaim_bgs list after we deleted
+ * all unused block_groups. This possibly gives us some more free
+ * space.
+ */
+ btrfs_reclaim_bgs(fs_info);
sleep:
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
@@ -2387,8 +2477,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
}
+ /* Initialize fs_info for all devices in any case */
+ btrfs_init_devices_late(fs_info);
/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
@@ -2792,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
+ mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
@@ -2802,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+ INIT_LIST_HEAD(&fs_info->reclaim_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -2890,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+
+ fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+ INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@@ -3009,6 +3104,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
+ /*
+ * btrfs_find_orphan_roots() is responsible for finding all the dead
+ * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+ * them into the fs_info->fs_roots_radix tree. This must be done before
+ * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+ * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+ * item before the root's tree is deleted - this means that if we unmount
+ * or crash before the deletion completes, on the next mount we will not
+ * delete what remains of the tree because the orphan item does not
+ * exists anymore, which is what tells us we have a pending deletion.
+ */
+ ret = btrfs_find_orphan_roots(fs_info);
+ if (ret)
+ goto out;
+
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
goto out;
@@ -3068,7 +3178,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
- ret = btrfs_find_orphan_roots(fs_info);
out:
return ret;
}
@@ -4234,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 36a3c973fda1..7a28314189b4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2490,19 +2490,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_block_group *block_group;
- int readonly = 0;
-
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
- if (!block_group || block_group->ro)
- readonly = 1;
- if (block_group)
- btrfs_put_block_group(block_group);
- return readonly;
-}
-
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3355,11 +3342,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
- if (btrfs_header_level(buf) == 0) {
- read_lock(&fs_info->tree_mod_log_lock);
- must_pin = !list_empty(&fs_info->tree_mod_seq_list);
- read_unlock(&fs_info->tree_mod_log_lock);
- }
+ if (btrfs_header_level(buf) == 0 &&
+ test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 910769d5fcdb..f2d1bb234377 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
+#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate)) {
if (is_data_inode(inode))
ret = btrfs_verify_data_csum(io_bio,
- bio_offset, page, start, end,
- mirror);
+ bio_offset, page, start, end);
else
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
@@ -3967,7 +3967,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
btrfs_tree_unlock(eb);
- if (!ret)
+ /*
+ * Either we don't need to submit any tree block, or we're submitting
+ * subpage eb.
+ * Subpage metadata doesn't use page locking at all, so we can skip
+ * the page locking.
+ */
+ if (!ret || fs_info->sectorsize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -4012,12 +4018,11 @@ err_unlock:
return ret;
}
-static void set_btree_ioerr(struct page *page)
+static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
- struct extent_buffer *eb = (struct extent_buffer *)page->private;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, eb->start, eb->len);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
@@ -4025,7 +4030,6 @@ static void set_btree_ioerr(struct page *page)
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
- fs_info = eb->fs_info;
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
eb->len, fs_info->dirty_metadata_batch);
@@ -4069,26 +4073,111 @@ static void set_btree_ioerr(struct page *page)
*/
switch (eb->log_index) {
case -1:
- set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
break;
case 0:
- set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
break;
case 1:
- set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
break;
default:
BUG(); /* unexpected, logic error */
}
}
+/*
+ * The endio specific version which won't touch any unsafe spinlock in endio
+ * context.
+ */
+static struct extent_buffer *find_extent_buffer_nolock(
+ struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ return eb;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * The endio function for subpage extent buffer write.
+ *
+ * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
+ * after all extent buffers in the page has finished their writeback.
+ */
+static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 bvec_end = bvec_start + bvec->bv_len - 1;
+ u64 cur_bytenr = bvec_start;
+
+ ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
+
+ /* Iterate through all extent buffers in the range */
+ while (cur_bytenr <= bvec_end) {
+ struct extent_buffer *eb;
+ int done;
+
+ /*
+ * Here we can't use find_extent_buffer(), as it may
+ * try to lock eb->refs_lock, which is not safe in endio
+ * context.
+ */
+ eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
+ ASSERT(eb);
+
+ cur_bytenr = eb->start + eb->len;
+
+ ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
+ done = atomic_dec_and_test(&eb->io_pages);
+ ASSERT(done);
+
+ if (bio->bi_status ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page, eb);
+ }
+
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+ end_extent_buffer_writeback(eb);
+ /*
+ * free_extent_buffer() will grab spinlock which is not
+ * safe in endio context. Thus here we manually dec
+ * the ref.
+ */
+ atomic_dec(&eb->refs);
+ }
+ }
+ bio_put(bio);
+}
+
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return end_bio_subpage_eb_writepage(fs_info, bio);
+
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4100,7 +4189,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- set_btree_ioerr(page);
+ set_btree_ioerr(page, eb);
}
end_page_writeback(page);
@@ -4114,6 +4203,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
+/*
+ * Unlike the work in write_one_eb(), we rely completely on extent locking.
+ * Page locking is only utilized at minimum to keep the VMM code happy.
+ *
+ * Caller should still call write_one_eb() other than this function directly.
+ * As write_one_eb() has extra preparation before submitting the extent buffer.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ bool no_dirty_ebs = false;
+ int ret;
+
+ /* clear_page_dirty_for_io() in subpage helper needs page locked */
+ lock_page(page);
+ btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+
+ /* Check if this is the last dirty bit to update nr_written */
+ no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+ eb->start, eb->len);
+ if (no_dirty_ebs)
+ clear_page_dirty_for_io(page);
+
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
+ eb->start, eb->len, eb->start - page_offset(page),
+ &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
+ false);
+ if (ret) {
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
+ set_btree_ioerr(page, eb);
+ unlock_page(page);
+
+ if (atomic_dec_and_test(&eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ return -EIO;
+ }
+ unlock_page(page);
+ /*
+ * Submission finished without problem, if no range of the page is
+ * dirty anymore, we have submitted a page. Update nr_written in wbc.
+ */
+ if (no_dirty_ebs)
+ update_nr_written(wbc, 1);
+ return ret;
+}
+
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -4145,6 +4284,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
memzero_extent_buffer(eb, start, end - start);
}
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return write_one_subpage_eb(eb, wbc, epd);
+
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -4156,7 +4298,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
if (ret) {
- set_btree_ioerr(p);
+ set_btree_ioerr(p, eb);
if (PageWriteback(p))
end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
@@ -4181,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
}
/*
+ * Submit one subpage btree page.
+ *
+ * The main difference to submit_eb_page() is:
+ * - Page locking
+ * For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ * We only flush bio if we may be unable to fit current extent buffers into
+ * current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ int submitted = 0;
+ u64 page_start = page_offset(page);
+ int bit_start = 0;
+ const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
+ int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int ret;
+
+ /* Lock and write each dirty extent buffers in the range */
+ while (bit_start < nbits) {
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct extent_buffer *eb;
+ unsigned long flags;
+ u64 start;
+
+ /*
+ * Take private lock to ensure the subpage won't be detached
+ * in the meantime.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ break;
+ }
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+ bit_start++;
+ continue;
+ }
+
+ start = page_start + bit_start * fs_info->sectorsize;
+ bit_start += sectors_per_node;
+
+ /*
+ * Here we just want to grab the eb without touching extra
+ * spin locks, so call find_extent_buffer_nolock().
+ */
+ eb = find_extent_buffer_nolock(fs_info, start);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * The eb has already reached 0 refs thus find_extent_buffer()
+ * doesn't return it. We don't need to write back such eb
+ * anyway.
+ */
+ if (!eb)
+ continue;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret == 0) {
+ free_extent_buffer(eb);
+ continue;
+ }
+ if (ret < 0) {
+ free_extent_buffer(eb);
+ goto cleanup;
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto cleanup;
+ submitted++;
+ }
+ return submitted;
+
+cleanup:
+ /* We hit error, end bio for the submitted extent buffers */
+ end_write_bio(epd, ret);
+ return ret;
+}
+
+/*
* Submit all page(s) of one extent buffer.
*
* @page: the page of one extent buffer
@@ -4212,6 +4446,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return submit_eb_subpage(page, wbc, epd);
+
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&mapping->private_lock);
@@ -4652,10 +4889,8 @@ void extent_readahead(struct readahead_control *rac)
int nr;
while ((nr = readahead_page_batch(rac, pagepool))) {
- u64 contig_start = page_offset(pagepool[0]);
- u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
-
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+ u64 contig_start = readahead_pos(rac);
+ u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
&em_cached, &bio, &bio_flags, &prev_em_start);
@@ -5469,36 +5704,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- /*
- * Lock our eb's refs_lock to avoid races with
- * free_extent_buffer. When we get our eb it might be flagged
- * with EXTENT_BUFFER_STALE and another task running
- * free_extent_buffer might have seen that flag set,
- * eb->refs == 2, that the buffer isn't under IO (dirty and
- * writeback flags not set) and it's still in the tree (flag
- * EXTENT_BUFFER_TREE_REF set), therefore being in the process
- * of decrementing the extent buffer's reference count twice.
- * So here we could race and increment the eb's reference count,
- * clear its stale flag, mark it as dirty and drop our reference
- * before the other task finishes executing free_extent_buffer,
- * which would later result in an attempt to free an extent
- * buffer that is dirty.
- */
- if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
- spin_lock(&eb->refs_lock);
- spin_unlock(&eb->refs_lock);
- }
- mark_extent_buffer_accessed(eb, NULL);
- return eb;
+ eb = find_extent_buffer_nolock(fs_info, start);
+ if (!eb)
+ return NULL;
+ /*
+ * Lock our eb's refs_lock to avoid races with free_extent_buffer().
+ * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
+ * another task running free_extent_buffer() might have seen that flag
+ * set, eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
+ * decrementing the extent buffer's reference count twice. So here we
+ * could race and increment the eb's reference count, clear its stale
+ * flag, mark it as dirty and drop our reference before the other task
+ * finishes executing free_extent_buffer, which would later result in
+ * an attempt to free an extent buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
}
- rcu_read_unlock();
-
- return NULL;
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5594,6 +5821,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+#if BITS_PER_LONG == 32
+ if (start >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+ "extent buffer %llu is beyond 32bit page cache limit", start);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+ if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ btrfs_warn_32bit_limit(fs_info);
+#endif
+
if (fs_info->sectorsize < PAGE_SIZE &&
offset_in_page(start) + len > PAGE_SIZE) {
btrfs_err(fs_info,
@@ -5665,7 +5903,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
- WARN_ON(PageDirty(p));
+ WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -5814,28 +6052,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
+static void btree_clear_page_dirty(struct page *page)
+{
+ ASSERT(PageDirty(page));
+ ASSERT(PageLocked(page));
+ clear_page_dirty_for_io(page);
+ xa_lock_irq(&page->mapping->i_pages);
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&page->mapping->i_pages);
+}
+
+static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ bool last;
+
+ /* btree_clear_page_dirty() needs page locked */
+ lock_page(page);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
+ eb->len);
+ if (last)
+ btree_clear_page_dirty(page);
+ unlock_page(page);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
struct page *page;
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return clear_subpage_extent_buffer_dirty(eb);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (!PageDirty(page))
continue;
-
lock_page(page);
- WARN_ON(!PagePrivate(page));
-
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ btree_clear_page_dirty(page);
ClearPageError(page);
unlock_page(page);
}
@@ -5856,10 +6117,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
- if (!was_dirty)
- for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ if (!was_dirty) {
+ bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+ /*
+ * For subpage case, we can have other extent buffers in the
+ * same page, and in clear_subpage_extent_buffer_dirty() we
+ * have to clear page dirty without subpage lock held.
+ * This can cause race where our page gets dirty cleared after
+ * we just set it.
+ *
+ * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * its page for other reasons, we can use page lock to prevent
+ * the above race.
+ */
+ if (subpage)
+ lock_page(eb->pages[0]);
+ for (i = 0; i < num_pages; i++)
+ btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
+ eb->start, eb->len);
+ if (subpage)
+ unlock_page(eb->pages[0]);
+ }
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
@@ -6217,12 +6496,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
return ret;
}
+/*
+ * Check that the extent buffer is uptodate.
+ *
+ * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
+ * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
+ */
+static void assert_eb_page_uptodate(const struct extent_buffer *eb,
+ struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ bool uptodate;
+
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page,
+ eb->start, eb->len);
+ WARN_ON(!uptodate);
+ } else {
+ WARN_ON(!PageUptodate(page));
+ }
+}
+
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
+ assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
@@ -6232,7 +6533,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
+ assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
@@ -6257,7 +6558,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -6286,7 +6587,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -6344,7 +6645,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
while (len > 0) {
page = dst->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(dst, page);
cur = min(len, (unsigned long)(PAGE_SIZE - offset));
@@ -6406,7 +6707,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, nr, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
@@ -6431,7 +6732,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_set) {
@@ -6442,7 +6743,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
@@ -6474,7 +6775,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_clear) {
@@ -6485,7 +6786,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 824640cb0ace..227215a5722c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -66,6 +66,7 @@ enum {
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
+struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
@@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
-struct btrfs_fs_info;
-struct btrfs_inode;
-
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 47cd3a6dc635..294602f139ef 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e155f013839..864c08d08a35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
else
num_written = btrfs_buffered_write(iocb, from);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&inode->lock);
- inode->last_sub_trans = inode->root->log_transid;
- spin_unlock(&inode->lock);
+ btrfs_set_inode_last_sub_trans(inode);
+
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@@ -2122,7 +2116,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- inode_lock(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -2135,11 +2129,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
&BTRFS_I(inode)->runtime_flags);
/*
- * Before we acquired the inode's lock, someone may have dirtied more
- * pages in the target range. We need to make sure that writeback for
- * any such pages does not start while we are logging the inode, because
- * if it does, any of the following might happen when we are not doing a
- * full inode sync:
+ * Before we acquired the inode's lock and the mmap lock, someone may
+ * have dirtied more pages in the target range. We need to make sure
+ * that writeback for any such pages does not start while we are logging
+ * the inode, because if it does, any of the following might happen when
+ * we are not doing a full inode sync:
*
* 1) We log an extent after its writeback finishes but before its
* checksums are added to the csum tree, leading to -EIO errors
@@ -2154,7 +2148,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2255,7 +2249,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
@@ -2285,7 +2279,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2605,16 +2599,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
* extents without inserting a new one, so we must abort the transaction to avoid
* a corruption.
*/
-int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
- struct btrfs_replace_extent_info *extent_info,
- struct btrfs_trans_handle **trans_out)
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
- u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
@@ -2662,10 +2657,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
drop_args.drop_cache = true;
while (cur_offset < end) {
drop_args.start = cur_offset;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
/* If we are punching a hole decrement the inode's byte count */
if (!extent_info)
- btrfs_update_inode_bytes(BTRFS_I(inode), 0,
+ btrfs_update_inode_bytes(inode, 0,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
@@ -2685,8 +2680,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_args.drop_end);
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@@ -2704,7 +2699,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* know to not set disk_i_size in this area until a new
* file extent is inserted here.
*/
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
@@ -2723,8 +2718,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
u64 replace_len = drop_args.drop_end -
extent_info->file_offset;
- ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
- path, extent_info, replace_len,
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len,
drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2735,9 +2730,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
extent_info->file_offset += replace_len;
}
- cur_offset = drop_args.drop_end;
-
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2756,9 +2749,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!extent_info) {
- ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
- &len);
+ cur_offset = drop_args.drop_end;
+ len = end - cur_offset;
+ if (!extent_info && len) {
+ ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
@@ -2771,14 +2765,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
/*
* If we were cloning, force the next fsync to be a full one since we
* we replaced (or just dropped in the case of cloning holes when
- * NO_HOLES is enabled) extents and extent maps.
- * This is for the sake of simplicity, and cloning into files larger
- * than 16Mb would force the full fsync any way (when
- * try_release_extent_mapping() is invoked during page cache truncation.
+ * NO_HOLES is enabled) file extent items and did not setup new extent
+ * maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
if (ret)
goto out_trans;
@@ -2804,8 +2795,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
*/
if (!extent_info && cur_offset < ino_size &&
cur_offset < drop_args.drop_end) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_args.drop_end);
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
@@ -2813,8 +2804,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
} else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_args.drop_end - cur_offset);
+ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
+ drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2822,7 +2813,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
if (extent_info) {
- ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
+ ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
if (ret) {
@@ -2868,7 +2859,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- inode_lock(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@@ -2908,7 +2899,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -2967,8 +2958,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
- &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
+ lockend, NULL, &trans);
btrfs_free_path(path);
if (ret)
goto out;
@@ -3009,7 +3000,7 @@ out_only_mutex:
ret = ret2;
}
}
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3335,7 +3326,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
}
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3377,7 +3368,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3487,7 +3478,7 @@ out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
@@ -3496,13 +3487,13 @@ out:
return ret;
}
-static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->i_size;
+ loff_t i_size = inode->vfs_inode.i_size;
u64 lockstart;
u64 lockend;
u64 start;
@@ -3525,11 +3516,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
while (start < i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
+ em = btrfs_get_extent_fiemap(inode, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
@@ -3551,7 +3541,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
cond_resched();
}
free_extent_map(em);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_extent_cached(&inode->io_tree, lockstart, lockend,
&cached_state);
if (ret) {
offset = ret;
@@ -3575,7 +3565,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_DATA:
case SEEK_HOLE:
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(inode, offset, whence);
+ offset = find_desired_extent(BTRFS_I(inode), offset, whence);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9988decd5717..e54466fc101f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -2539,6 +2540,7 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
@@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
}
/* All the region is now unusable. Mark it as unused and reclaim */
- if (block_group->zone_unusable == block_group->length)
+ if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
+ } else if (block_group->zone_unusable >=
+ div_factor_fine(block_group->length,
+ fs_info->bg_reclaim_threshold)) {
+ btrfs_mark_bg_to_reclaim(block_group);
+ }
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7cdf65be3707..b21d491b3adc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
* BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
* BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
* return -EAGAIN
+ * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
@@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
}
inode_lock(inode);
}
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
return 0;
}
@@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
*/
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(inode);
else
@@ -1516,7 +1521,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
- int *page_started, int force,
+ int *page_started,
unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1530,6 +1535,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
+ const bool force = inode->flags & BTRFS_INODE_NODATACOW;
path = btrfs_alloc_path();
if (!path) {
@@ -1863,23 +1869,16 @@ error:
return ret;
}
-static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
+static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
-
- if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
- !(inode->flags & BTRFS_INODE_PREALLOC))
- return 0;
-
- /*
- * @defrag_bytes is a hint value, no spinlock held here,
- * if is not zero, it means the file is defragging.
- * Force cow if given extent needs to be defragged.
- */
- if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
- return 1;
-
- return 0;
+ if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
+ 0, NULL))
+ return false;
+ return true;
+ }
+ return false;
}
/*
@@ -1891,17 +1890,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
struct writeback_control *wbc)
{
int ret;
- int force_cow = need_force_cow(inode, start, end);
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
- if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
- ASSERT(!zoned);
- ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 1, nr_written);
- } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ if (should_nocow(inode, start, end)) {
ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 0, nr_written);
+ page_started, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
@@ -3099,11 +3093,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
+ * @start: logical offset in the file
*
* The length of such check is always one sector size.
*/
static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
- u32 bio_offset, struct page *page, u32 pgoff)
+ u32 bio_offset, struct page *page, u32 pgoff,
+ u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -3130,8 +3126,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
kunmap_atomic(kaddr);
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
- csum, csum_expected, io_bio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
+ io_bio->mirror_num);
if (io_bio->device)
btrfs_dev_stat_inc_and_print(io_bio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3149,10 +3145,9 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
- * @mirror: mirror number
*/
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end, int mirror)
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -3184,7 +3179,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
pg_off += sectorsize, bio_offset += sectorsize) {
int ret;
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+ ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ page_offset(page) + pg_off);
if (ret < 0)
return -EIO;
}
@@ -3390,15 +3386,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
int is_dead_root = 0;
/*
- * this is an orphan in the tree root. Currently these
+ * This is an orphan in the tree root. Currently these
* could come from 2 sources:
- * a) a snapshot deletion in progress
+ * a) a root (snapshot/subvolume) deletion in progress
* b) a free space cache inode
- * We need to distinguish those two, as the snapshot
- * orphan must not get deleted.
- * find_dead_roots already ran before us, so if this
- * is a snapshot deletion, we should find the root
- * in the fs_roots radix tree.
+ * We need to distinguish those two, as the orphan item
+ * for a root must not get deleted before the deletion
+ * of the snapshot/subvolume's tree completes.
+ *
+ * btrfs_find_orphan_roots() ran before us, which has
+ * found all deleted roots and loaded them into
+ * fs_info->fs_roots_radix. So here we can find if an
+ * orphan item corresponds to a deleted root by looking
+ * up the root from that radix tree.
*/
spin_lock(&fs_info->fs_roots_radix_lock);
@@ -4329,7 +4329,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
goto out_end_trans;
}
- btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
@@ -7023,7 +7027,7 @@ next:
if (ret)
goto out;
} else {
- map = kmap(page);
+ map = kmap_local_page(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
@@ -7031,7 +7035,7 @@ next:
PAGE_SIZE - pg_offset -
copy_size);
}
- kunmap(page);
+ kunmap_local(map);
}
flush_dcache_page(page);
}
@@ -7259,6 +7263,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
return em;
}
+static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ bool readonly = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = true;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
/*
* Check if we can do nocow write into the range [@offset, @offset + @len)
*
@@ -7910,7 +7927,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, io_bio,
- bio_offset, bvec.bv_page, pgoff))) {
+ bio_offset, bvec.bv_page,
+ pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
start, bvec.bv_page,
btrfs_ino(BTRFS_I(inode)),
@@ -8169,10 +8187,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
- WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
- fs_info->max_zone_append_size &&
- bio_op(bio) != REQ_OP_ZONE_APPEND);
-
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
file_offset);
@@ -8403,17 +8417,11 @@ again:
* for the finish_ordered_io
*/
if (TestClearPagePrivate2(page)) {
- struct btrfs_ordered_inode_tree *tree;
- u64 new_len;
-
- tree = &inode->ordered_tree;
-
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- new_len = start - ordered->file_offset;
- if (new_len < ordered->truncated_len)
- ordered->truncated_len = new_len;
- spin_unlock_irq(&tree->lock);
+ ordered->truncated_len = min(ordered->truncated_len,
+ start - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
start,
@@ -8539,6 +8547,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
size = i_size_read(inode);
@@ -8567,6 +8576,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -8619,11 +8629,10 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
@@ -8632,6 +8641,7 @@ again:
out_unlock:
unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
@@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->i_mmap_lock);
return inode;
}
@@ -9101,8 +9112,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
/*
* We need to find a free sequence number both in the source and
@@ -9406,8 +9420,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
@@ -9919,7 +9936,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
goto free_qgroup;
}
- ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
@@ -10603,6 +10620,8 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct file_operations btrfs_dir_file_operations = {
@@ -10656,6 +10675,8 @@ static const struct inode_operations btrfs_file_inode_operations = {
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8d53fea4c61..ee1dbabb5d3c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -26,6 +26,7 @@
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -153,16 +154,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
new_fl);
}
-static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
-{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
-
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
/*
* Check if @flags are a supported and valid set of FS_*_FL flags and that
* the old and new flags are not conflicting
@@ -201,9 +192,22 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
return 0;
}
-static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+/*
+ * Set flags/xflags from the internal inode flags. The remaining items of
+ * fsxattr are zeroed.
+ */
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+ return 0;
+}
+
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
@@ -213,34 +217,21 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
const char *comp = NULL;
u32 binode_flags;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
-
if (btrfs_root_readonly(root))
return -EROFS;
- if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
- return -EFAULT;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- inode_lock(inode);
- fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+ fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
-
- ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
- if (ret)
- goto out_unlock;
-
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
- goto out_unlock;
+ return ret;
ret = check_fsflags_compatible(fs_info, fsflags);
if (ret)
- goto out_unlock;
+ return ret;
binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
@@ -263,6 +254,14 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOATIME;
else
binode_flags &= ~BTRFS_INODE_NOATIME;
+
+ /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
+ if (!fa->flags_valid) {
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ goto update_flags;
+ }
+
if (fsflags & FS_DIRSYNC_FL)
binode_flags |= BTRFS_INODE_DIRSYNC;
else
@@ -303,10 +302,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_unlock;
- }
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
binode_flags |= BTRFS_INODE_COMPRESS;
binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
@@ -323,10 +320,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* 2 for properties
*/
trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
@@ -344,6 +339,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
+update_flags:
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
@@ -352,44 +348,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
out_end_trans:
btrfs_end_transaction(trans);
- out_unlock:
- inode_unlock(inode);
- mnt_drop_write_file(file);
return ret;
}
-/*
- * Translate btrfs internal inode flags to xflags as expected by the
- * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
- * silently dropped.
- */
-static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
-{
- unsigned int xflags = 0;
-
- if (flags & BTRFS_INODE_APPEND)
- xflags |= FS_XFLAG_APPEND;
- if (flags & BTRFS_INODE_IMMUTABLE)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (flags & BTRFS_INODE_NOATIME)
- xflags |= FS_XFLAG_NOATIME;
- if (flags & BTRFS_INODE_NODUMP)
- xflags |= FS_XFLAG_NODUMP;
- if (flags & BTRFS_INODE_SYNC)
- xflags |= FS_XFLAG_SYNC;
-
- return xflags;
-}
-
-/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
-static int check_xflags(unsigned int flags)
-{
- if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
- FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
- return -EOPNOTSUPP;
- return 0;
-}
-
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
@@ -402,111 +363,6 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
-/*
- * Set the xflags from the internal inode flags. The remaining items of fsxattr
- * are zeroed.
- */
-static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
-{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- struct fsxattr fa;
-
- simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
-
- return 0;
-}
-
-static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
- struct btrfs_trans_handle *trans;
- struct fsxattr fa, old_fa;
- unsigned old_flags;
- unsigned old_i_flags;
- int ret = 0;
-
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
-
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- ret = check_xflags(fa.fsx_xflags);
- if (ret)
- return ret;
-
- if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- old_flags = binode->flags;
- old_i_flags = inode->i_flags;
-
- simple_fill_fsxattr(&old_fa,
- btrfs_inode_flags_to_xflags(binode->flags));
- ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (ret)
- goto out_unlock;
-
- if (fa.fsx_xflags & FS_XFLAG_SYNC)
- binode->flags |= BTRFS_INODE_SYNC;
- else
- binode->flags &= ~BTRFS_INODE_SYNC;
- if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
- binode->flags |= BTRFS_INODE_IMMUTABLE;
- else
- binode->flags &= ~BTRFS_INODE_IMMUTABLE;
- if (fa.fsx_xflags & FS_XFLAG_APPEND)
- binode->flags |= BTRFS_INODE_APPEND;
- else
- binode->flags &= ~BTRFS_INODE_APPEND;
- if (fa.fsx_xflags & FS_XFLAG_NODUMP)
- binode->flags |= BTRFS_INODE_NODUMP;
- else
- binode->flags &= ~BTRFS_INODE_NODUMP;
- if (fa.fsx_xflags & FS_XFLAG_NOATIME)
- binode->flags |= BTRFS_INODE_NOATIME;
- else
- binode->flags &= ~BTRFS_INODE_NOATIME;
-
- /* 1 item for the inode */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
-
- btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-
- btrfs_end_transaction(trans);
-
-out_unlock:
- if (ret) {
- binode->flags = old_flags;
- inode->i_flags = old_i_flags;
- }
-
- inode_unlock(inode);
- mnt_drop_write_file(file);
-
- return ret;
-}
-
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
@@ -697,8 +553,6 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- leaf = NULL;
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
@@ -707,8 +561,22 @@ static noinline int create_subvol(struct inode *dir,
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
- if (ret)
+ if (ret) {
+ /*
+ * Since we don't abort the transaction in this case, free the
+ * tree block so that we don't leak space and leave the
+ * filesystem in an inconsistent state (an extent item in the
+ * extent tree without backreferences). Also no need to have
+ * the tree block locked since it is not in any tree at this
+ * point, so no other task can find it and use it.
+ */
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ free_extent_buffer(leaf);
goto fail;
+ }
+
+ free_extent_buffer(leaf);
+ leaf = NULL;
key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
@@ -721,7 +589,12 @@ static noinline int create_subvol(struct inode *dir,
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
- btrfs_record_root_in_trans(trans, new_root);
+ ret = btrfs_record_root_in_trans(trans, new_root);
+ if (ret) {
+ btrfs_put_root(new_root);
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
@@ -1014,7 +887,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
return error;
}
@@ -1612,7 +1485,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += cluster;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
} else {
@@ -1621,13 +1494,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
}
if (ret < 0) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1675,9 +1548,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
out_ra:
if (do_compress) {
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
}
if (!file)
kfree(ra);
@@ -3112,9 +2985,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
if (!err) {
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
@@ -3123,7 +2996,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
out_dput:
dput(dentry);
out_unlock_dir:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -4915,10 +4788,6 @@ long btrfs_ioctl(struct file *file, unsigned int
void __user *argp = (void __user *)arg;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- return btrfs_ioctl_getflags(file, argp);
- case FS_IOC_SETFLAGS:
- return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case FS_IOC_GETFSLABEL:
@@ -5044,10 +4913,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
- case FS_IOC_FSGETXATTR:
- return btrfs_ioctl_fsgetxattr(file, argp);
- case FS_IOC_FSSETXATTR:
- return btrfs_ioctl_fssetxattr(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
return btrfs_ioctl_get_subvol_info(file, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
@@ -5067,12 +4932,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
* handling is necessary.
*/
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 9084a950dc09..cd042c7567a4 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
char *data_in;
- char *cpage_out;
+ char *cpage_out, *sizes_ptr;
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
@@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- cpage_out = kmap(pages[0]);
- write_compress_length(cpage_out, tot_out);
-
- kunmap(pages[0]);
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, tot_out);
+ kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 985a21558437..07b0b4218791 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
-/*
- * helper to check if a given offset is inside a given entry
- */
-static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
-{
- if (file_offset < entry->file_offset ||
- entry->file_offset + entry->num_bytes <= file_offset)
- return 0;
- return 1;
-}
-
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
@@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
- if (offset_in_entry(entry, file_offset))
+ if (in_range(file_offset, entry->file_offset, entry->num_bytes))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
@@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, *file_offset))
+ if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
goto out;
dec_start = max(*file_offset, entry->file_offset);
@@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
- if (!offset_in_entry(entry, file_offset))
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
goto out;
if (io_size > entry->bytes_left)
@@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, file_offset))
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
if (entry)
refcount_inc(&entry->refs);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 99e0853e4d3b..e60c07f36427 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,8 +39,8 @@ struct btrfs_ordered_sum {
*/
enum {
/*
- * Different types for direct io, one and only one of the 4 type can
- * be set when creating ordered extent.
+ * Different types for ordered extents, one and only one of the 4 types
+ * need to be set when creating ordered extent.
*
* REGULAR: For regular non-compressed COW write
* NOCOW: For NOCOW write into existing non-hole extent
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 14ff388fd3bd..2319c923c9e6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,6 +23,7 @@
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
+#include "tree-mod-log.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -226,7 +227,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
{
struct btrfs_qgroup_list *list;
- btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
@@ -243,7 +243,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
list_del(&list->next_member);
kfree(list);
}
- kfree(qgroup);
}
/* must be called with qgroup_lock held */
@@ -569,6 +568,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup);
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -1578,6 +1579,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
+ * spinlock, since the sysfs_remove_group() function needs to take
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+ */
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -2631,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
- * Use SEQ_LAST as time_seq to do special search, which
- * doesn't lock tree or delayed_refs and search current
- * root. It's safe inside commit_transaction().
+ * Use BTRFS_SEQ_LAST as time_seq to do special search,
+ * which doesn't lock tree or delayed_refs and search
+ * current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -3535,37 +3544,19 @@ static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
- bool can_commit = true;
- /*
- * If current process holds a transaction, we shouldn't flush, as we
- * assume all space reservation happens before a transaction handle is
- * held.
- *
- * But there are cases like btrfs_delayed_item_reserve_metadata() where
- * we try to reserve space with one transction handle already held.
- * In that case we can't commit transaction, but at least try to end it
- * and hope the started data writes can free some space.
- */
- if (current->journal_info &&
- current->journal_info != BTRFS_SEND_TRANS_STUB)
- can_commit = false;
+ /* Can't hold an open transaction or we run the risk of deadlocking */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_SEND_TRANS_STUB);
+ if (WARN_ON(current->journal_info &&
+ current->journal_info != BTRFS_SEND_TRANS_STUB))
+ return 0;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
- /*
- * We are already holding a transaction, thus we can block other
- * threads from flushing. So exit right now. This increases
- * the chance of EDQUOT for heavy load and near limit cases.
- * But we can argue that if we're already near limit, EDQUOT is
- * unavoidable anyway.
- */
- if (!can_commit)
- return 0;
-
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
@@ -3582,10 +3573,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
}
- if (can_commit)
- ret = btrfs_commit_transaction(trans);
- else
- ret = btrfs_end_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@@ -3638,8 +3626,7 @@ cleanup:
qgroup_unreserve_range(inode, reserved, start, len);
out:
if (new_reserved) {
- extent_changeset_release(reserved);
- kfree(reserved);
+ extent_changeset_free(reserved);
*reserved_ret = NULL;
}
return ret;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8c31357f08ed..244d499ebc72 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,6 +13,7 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ pointers[stripe] = kmap_local_page(p);
}
/* then add the parity stripe */
p = rbio_pstripe_page(rbio, pagenr);
SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
+ pointers[stripe++] = kmap_local_page(p);
if (has_qstripe) {
@@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
*/
p = rbio_qstripe_page(rbio, pagenr);
SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
+ pointers[stripe++] = kmap_local_page(p);
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
@@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
copy_page(pointers[nr_data], pointers[0]);
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
-
-
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
/*
@@ -1634,7 +1633,8 @@ struct btrfs_plug_cb {
/*
* rbios on the plug list are sorted for easier merging.
*/
-static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
plug_list);
@@ -1776,6 +1776,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
+ void **unmap_array;
int faila = -1, failb = -1;
struct page *page;
blk_status_t err;
@@ -1787,6 +1788,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto cleanup_io;
}
+ /*
+ * Store copy of pointers that does not get reordered during
+ * reconstruction so that kunmap_local works.
+ */
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!unmap_array) {
+ err = BLK_STS_RESOURCE;
+ goto cleanup_pointers;
+ }
+
faila = rbio->faila;
failb = rbio->failb;
@@ -1808,8 +1819,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
!test_bit(pagenr, rbio->dbitmap))
continue;
- /* setup our array of pointers with pages
- * from each stripe
+ /*
+ * Setup our array of pointers with pages from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
@@ -1823,7 +1837,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
- pointers[stripe] = kmap(page);
+ pointers[stripe] = kmap_local_page(page);
+ unmap_array[stripe] = pointers[stripe];
}
/* all raid6 handling here */
@@ -1916,24 +1931,14 @@ pstripe:
}
}
}
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * if we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
- kunmap(page);
- }
+ for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
+ kunmap_local(unmap_array[stripe]);
}
err = BLK_STS_OK;
cleanup:
+ kfree(unmap_array);
+cleanup_pointers:
kfree(pointers);
cleanup_io:
@@ -2358,13 +2363,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
}
SetPageUptodate(q_page);
- pointers[rbio->real_stripes - 1] = kmap(q_page);
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap(p_page);
+ pointers[nr_data] = kmap_local_page(p_page);
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
@@ -2372,7 +2377,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ pointers[stripe] = kmap_local_page(p);
}
if (has_qstripe) {
@@ -2387,22 +2392,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap(p);
+ parity = kmap_local_page(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
copy_page(parity, pointers[rbio->scrubp]);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
- kunmap(p);
+ kunmap_local(parity);
- for (stripe = 0; stripe < nr_data; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ for (stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
- kunmap(p_page);
+ kunmap_local(pointers[nr_data]);
__free_page(p_page);
if (q_page) {
- kunmap(q_page);
+ kunmap_local(pointers[rbio->real_stripes - 1]);
__free_page(q_page);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 762881b777b3..f4ec06b53aa0 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -478,9 +478,9 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
- ret = btrfs_replace_file_extents(inode, path, drop_start,
- new_key.offset + datal - 1, &clone_info,
- &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ drop_start, new_key.offset + datal - 1,
+ &clone_info, &trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -567,8 +567,8 @@ process_slot:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- ret = btrfs_replace_file_extents(inode, path, last_dest_end,
- destoff + len - 1, NULL, &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ last_dest_end, destoff + len - 1, NULL, &trans);
if (ret)
goto out;
@@ -604,6 +604,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
+static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+ down_write(&BTRFS_I(inode1)->i_mmap_lock);
+ down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+}
+
+static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+{
+ up_write(&BTRFS_I(inode1)->i_mmap_lock);
+ up_write(&BTRFS_I(inode2)->i_mmap_lock);
+}
+
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
@@ -820,6 +834,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
len, remap_flags);
}
+static bool file_sync_write(const struct file *file)
+{
+ if (file->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(file)))
+ return true;
+
+ return false;
+}
+
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
@@ -832,10 +856,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (same_inode)
- inode_lock(src_inode);
- else
+ if (same_inode) {
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
lock_two_nondirectories(src_inode, dst_inode);
+ btrfs_double_mmap_lock(src_inode, dst_inode);
+ }
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
@@ -848,10 +874,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
+ if (same_inode) {
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
+ }
+
+ /*
+ * If either the source or the destination file was opened with O_SYNC,
+ * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+ * source files/ranges, so that after a successful return (0) followed
+ * by a power failure results in the reflinked data to be readable from
+ * both files/ranges.
+ */
+ if (ret == 0 && len > 0 &&
+ (file_sync_write(src_file) || file_sync_write(dst_file))) {
+ ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+ if (ret == 0)
+ ret = btrfs_sync_file(dst_file, destoff,
+ destoff + len - 1, 0);
+ }
return ret < 0 ? ret : len;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 232d5da7b7be..b70be2ac2e9e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
- btrfs_panic(fs_info, -EEXIST,
+ btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
+ return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
@@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
- int ret;
+ int ret = 0;
+ bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
+ if (!root_item)
+ return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
@@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
}
+ /*
+ * We have changed references at this point, we must abort the
+ * transaction if anything fails.
+ */
+ must_abort = true;
+
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
kfree(root_item);
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
- BUG_ON(IS_ERR(reloc_root));
+ if (IS_ERR(reloc_root)) {
+ ret = PTR_ERR(reloc_root);
+ goto abort;
+ }
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
+fail:
+ kfree(root_item);
+abort:
+ if (must_abort)
+ btrfs_abort_transaction(trans, ret);
+ return ERR_PTR(ret);
}
/*
@@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int ret;
if (!have_reloc_root(root))
- goto out;
+ return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
- BUG_ON(ret);
btrfs_put_root(reloc_root);
-out:
- return 0;
+ return ret;
}
/*
@@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1205,7 +1233,11 @@ again:
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
BTRFS_NESTING_COW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return ret;
+ }
}
if (next_key) {
@@ -1217,7 +1249,7 @@ again:
parent = eb;
while (1) {
level = btrfs_header_level(parent);
- BUG_ON(level < lowest_level);
+ ASSERT(level >= lowest_level);
ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
@@ -1265,7 +1297,11 @@ again:
ret = btrfs_cow_block(trans, dest, eb, parent,
slot, &eb,
BTRFS_NESTING_COW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ break;
+ }
}
btrfs_tree_unlock(parent);
@@ -1289,7 +1325,11 @@ again:
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
path->lowest_level = 0;
- BUG_ON(ret);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
/*
* Info qgroup to trace both subtrees.
@@ -1329,27 +1369,39 @@ again:
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_unlock_up_safe(path, 0);
@@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level,
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
-static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root)
+static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
+ int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
@@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
sizeof(reloc_root_item->drop_progress));
btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
+
+ return 0;
}
static int clean_dirty_subvols(struct reloc_control *rc)
@@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (ret == 0)
- insert_dirty_subvol(trans, rc, root);
+ if (ret == 0) {
+ ret = insert_dirty_subvol(trans, rc, root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -1825,8 +1885,18 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ if (IS_ERR(root)) {
+ /*
+ * Even if we have an error we need this reloc root
+ * back on our list so we can clean up properly.
+ */
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_abort_transaction(trans, (int)PTR_ERR(root));
+ if (!err)
+ err = PTR_ERR(root);
+ break;
+ }
+ ASSERT(root->reloc_root == reloc_root);
/*
* set reference count to 1, so btrfs_recover_relocation
@@ -1834,16 +1904,27 @@ again:
*/
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ /*
+ * Even if we have an error we need this reloc root back on our
+ * list so we can clean up properly.
+ */
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ break;
+ }
}
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -1888,8 +1969,29 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ if (IS_ERR(root)) {
+ /*
+ * For recovery we read the fs roots on mount,
+ * and if we didn't find the root then we marked
+ * the reloc root as a garbage root. For normal
+ * relocation obviously the root should exist in
+ * memory. However there's no reason we can't
+ * handle the error properly here just in case.
+ */
+ ASSERT(0);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ if (root->reloc_root != reloc_root) {
+ /*
+ * This is actually impossible without something
+ * going really wrong (like weird race condition
+ * or cosmic rays).
+ */
+ ASSERT(0);
+ ret = -EINVAL;
+ goto out;
+ }
ret = merge_reloc_root(rc, root);
btrfs_put_root(root);
if (ret) {
@@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+
+ /*
+ * This should succeed, since we can't have a reloc root without having
+ * already looked up the actual root and created the reloc root for this
+ * root.
+ *
+ * However if there's some sort of corruption where we have a ref to a
+ * reloc root without a corresponding root this could return ENOENT.
+ */
+ if (IS_ERR(root)) {
+ ASSERT(0);
+ return PTR_ERR(root);
+ }
+ if (root->reloc_root != reloc_root) {
+ ASSERT(0);
+ btrfs_err(fs_info,
+ "root %llu has two reloc roots associated with it",
+ reloc_root->root_key.offset);
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
ret = btrfs_record_root_in_trans(trans, root);
btrfs_put_root(root);
@@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
+ int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
- BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
+
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a
+ * block that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve
+ * improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug
+ * in the backref walking code.
+ */
+ if (!root) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- record_reloc_root_in_trans(trans, root);
+ ret = record_reloc_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
break;
}
- btrfs_record_root_in_trans(trans, root);
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
root = root->reloc_root;
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
+
if (next->new_bytenr != root->node->start) {
- BUG_ON(next->new_bytenr);
- BUG_ON(!list_empty(&next->list));
+ /*
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set and this shouldn't be in the changed
+ * list. If it is then we have multiple roots pointing
+ * at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
+ */
+ ASSERT(next->new_bytenr == 0);
+ ASSERT(list_empty(&next->list));
+ if (next->new_bytenr || !list_empty(&next->list)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
@@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
- if (!root)
- return NULL;
+ if (!root) {
+ /*
+ * This can happen if there's fs corruption or if there's a bug
+ * in the backref lookup code.
+ */
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
@@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
+
+ /*
+ * This can occur if we have incomplete extent refs leading all
+ * the way up a particular path, in this case return -EUCLEAN.
+ */
+ if (!root)
+ return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
@@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
int slot;
int ret = 0;
- BUG_ON(lowest && node->eb);
+ /*
+ * If we are lowest then this is the first time we're processing this
+ * block, and thus shouldn't have an eb associated with it yet.
+ */
+ ASSERT(!lowest || !node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
@@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
- BUG_ON(!root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto next;
+ }
if (upper->eb && !upper->locked) {
if (!lowest) {
@@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
free_extent_buffer(eb);
if (ret < 0)
goto next;
- BUG_ON(node->eb != eb);
+ /*
+ * We've just COWed this block, it should have updated
+ * the correct backref node entry.
+ */
+ ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb));
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
-
- ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_drop_subtree(trans, root, eb,
+ upper->eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
next:
if (!upper->pending)
@@ -2302,7 +2498,12 @@ next:
}
path->lowest_level = 0;
- BUG_ON(ret == -ENOSPC);
+
+ /*
+ * We should have allocated all of our space in the block rsv and thus
+ * shouldn't ENOSPC.
+ */
+ ASSERT(ret != -ENOSPC);
return ret;
}
@@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(node->processed);
root = select_one_root(node);
- if (root == ERR_PTR(-ENOENT)) {
- update_processed_blocks(rc, node);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+
+ /* See explanation in select_one_root for the -EUCLEAN case. */
+ ASSERT(ret == -ENOENT);
+ if (ret == -ENOENT) {
+ ret = 0;
+ update_processed_blocks(rc, node);
+ }
goto out;
}
if (root) {
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
- BUG_ON(node->new_bytenr);
- BUG_ON(!list_empty(&node->list));
- btrfs_record_root_in_trans(trans, root);
+ /*
+ * This block was the root block of a root, and this is
+ * the first time we're processing the block and thus it
+ * should not have had the ->new_bytenr modified and
+ * should have not been included on the changed list.
+ *
+ * However in the case of corruption we could have
+ * multiple refs pointing to the same block improperly,
+ * and thus we would trip over these checks. ASSERT()
+ * for the developer case, because it could indicate a
+ * bug in the backref code, however error out for a
+ * normal user in the case of corruption.
+ */
+ ASSERT(node->new_bytenr == 0);
+ ASSERT(list_empty(&node->list));
+ if (node->new_bytenr || !list_empty(&node->list)) {
+ btrfs_err(root->fs_info,
+ "bytenr %llu has improper references to it",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ goto out;
+ /*
+ * Another thread could have failed, need to check if we
+ * have reloc_root actually set.
+ */
+ if (!root->reloc_root) {
+ ret = -ENOENT;
+ goto out;
+ }
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
@@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return btrfs_end_transaction(trans);
}
- inode_lock(&inode->vfs_inode);
+ btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
@@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
break;
}
- inode_unlock(&inode->vfs_inode);
+ btrfs_inode_unlock(&inode->vfs_inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc)
mutex_unlock(&fs_info->reloc_mutex);
}
-static int check_extent_flags(u64 flags)
-{
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
- !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- return 1;
- return 0;
-}
-
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
@@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans);
- return 0;
+ return btrfs_commit_transaction(trans);
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
- u32 item_size;
int ret;
int err = 0;
int progress = 0;
@@ -3334,19 +3556,7 @@ restart:
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- flags = btrfs_extent_flags(path->nodes[0], ei);
- ret = check_extent_flags(flags);
- BUG_ON(ret);
- } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
- err = -EINVAL;
- btrfs_print_v0_err(trans->fs_info);
- btrfs_abort_transaction(trans, err);
- break;
- } else {
- BUG();
- }
+ flags = btrfs_extent_flags(path->nodes[0], ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
@@ -3445,7 +3655,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
@@ -3488,6 +3700,35 @@ out:
return ret;
}
+static void delete_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, root, path);
+out:
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ btrfs_free_path(path);
+}
+
/*
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
@@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
goto out;
err = __insert_orphan_inode(trans, root, objectid);
- BUG_ON(err);
+ if (err)
+ goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
- BUG_ON(IS_ERR(inode));
+ if (IS_ERR(inode)) {
+ delete_orphan_inode(trans, root, objectid);
+ err = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
@@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
err = __add_reloc_root(reloc_root);
- BUG_ON(err < 0); /* -ENOMEM or logic error */
+ ASSERT(err != -EEXIST);
+ if (err) {
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(fs_root);
+ btrfs_end_transaction(trans);
+ goto out_unset;
+ }
fs_root->reloc_root = btrfs_grab_root(reloc_root);
btrfs_put_root(fs_root);
}
@@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3d9088eab2fc..485cda3eb8d7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -206,9 +206,6 @@ struct full_stripe_lock {
struct mutex mutex;
};
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
@@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
@@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
-static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
@@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
- ro_set = 0;
- goto done;
+ btrfs_put_block_group(cache);
+ goto skip;
}
spin_unlock(&cache->lock);
}
@@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
cache, found_key.offset))
ro_set = 0;
-done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8f323859156b..55741adf9071 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+ path->reada = READA_FORWARD_ALWAYS;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -6688,15 +6689,35 @@ out:
return ret;
}
-static int tree_move_down(struct btrfs_path *path, int *level)
+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
+ struct extent_buffer *parent = path->nodes[*level];
+ int slot = path->slots[*level];
+ const int nritems = btrfs_header_nritems(parent);
+ u64 reada_max;
+ u64 reada_done = 0;
BUG_ON(*level == 0);
- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
+ /*
+ * Trigger readahead for the next leaves we will process, so that it is
+ * very likely that when we need them they are already in memory and we
+ * will not block on disk IO. For nodes we only do readahead for one,
+ * since the time window between processing nodes is typically larger.
+ */
+ reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
+
+ for (slot++; slot < nritems && reada_done < reada_max; slot++) {
+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
+ btrfs_readahead_node_child(parent, slot);
+ reada_done += eb->fs_info->nodesize;
+ }
+ }
+
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
@@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
static int tree_advance(struct btrfs_path *path,
int *level, int root_level,
int allow_down,
- struct btrfs_key *key)
+ struct btrfs_key *key,
+ u64 reada_min_gen)
{
int ret;
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(path, level, root_level);
} else {
- ret = tree_move_down(path, level);
+ ret = tree_move_down(path, level, reada_min_gen);
}
if (ret >= 0) {
if (*level == 0)
@@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
+ u64 reada_min_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
+ /*
+ * Our right root is the parent root, while the left root is the "send"
+ * root. We know that all new nodes/leaves in the left root must have
+ * a generation greater than the right root's generation, so we trigger
+ * readahead for those nodes and leaves of the left root, as we know we
+ * will need to read them at some point.
+ */
+ reada_min_gen = btrfs_header_generation(right_root->commit_root);
up_read(&fs_info->commit_root_sem);
if (left_level == 0)
@@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(left_path, &left_level,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
- &left_key);
+ &left_key, reada_min_gen);
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
@@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(right_path, &right_level,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
- &right_key);
+ &right_key, reada_min_gen);
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2da6177f4b0b..2dc674b7c3b1 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
*/
- ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c69049e7daa9..2d19089ab625 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -4,6 +4,64 @@
#include "ctree.h"
#include "subpage.h"
+/*
+ * Subpage (sectorsize < PAGE_SIZE) support overview:
+ *
+ * Limitations:
+ *
+ * - Only support 64K page size for now
+ * This is to make metadata handling easier, as 64K page would ensure
+ * all nodesize would fit inside one page, thus we don't need to handle
+ * cases where a tree block crosses several pages.
+ *
+ * - Only metadata read-write for now
+ * The data read-write part is in development.
+ *
+ * - Metadata can't cross 64K page boundary
+ * btrfs-progs and kernel have done that for a while, thus only ancient
+ * filesystems could have such problem. For such case, do a graceful
+ * rejection.
+ *
+ * Special behavior:
+ *
+ * - Metadata
+ * Metadata read is fully supported.
+ * Meaning when reading one tree block will only trigger the read for the
+ * needed range, other unrelated range in the same page will not be touched.
+ *
+ * Metadata write support is partial.
+ * The writeback is still for the full page, but we will only submit
+ * the dirty extent buffers in the page.
+ *
+ * This means, if we have a metadata page like this:
+ *
+ * Page offset
+ * 0 16K 32K 48K 64K
+ * |/////////| |///////////|
+ * \- Tree block A \- Tree block B
+ *
+ * Even if we just want to writeback tree block A, we will also writeback
+ * tree block B if it's also dirty.
+ *
+ * This may cause extra metadata writeback which results more COW.
+ *
+ * Implementation:
+ *
+ * - Common
+ * Both metadata and data will use a new structure, btrfs_subpage, to
+ * record the status of each sector inside a page. This provides the extra
+ * granularity needed.
+ *
+ * - Metadata
+ * Since we have multiple tree blocks inside one page, we can't rely on page
+ * locking anymore, or we will have greatly reduced concurrency or even
+ * deadlocks (hold one tree lock while trying to lock another tree lock in
+ * the same page).
+ *
+ * Thus for metadata locking, subpage support relies on io_tree locking only.
+ * This means a slightly higher tree locking latency.
+ */
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
@@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
+void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->dirty_bitmap |= tmp;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ set_page_dirty(page);
+}
+
+/*
+ * Extra clear_and_test function for subpage dirty bitmap.
+ *
+ * Return true if we're the last bits in the dirty_bitmap and clear the
+ * dirty_bitmap.
+ * Return false otherwise.
+ *
+ * NOTE: Callers should manually clear page dirty for true case, as we have
+ * extra handling for tree blocks.
+ */
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+ bool last = false;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->dirty_bitmap &= ~tmp;
+ if (subpage->dirty_bitmap == 0)
+ last = true;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
+}
+
+void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ bool last;
+
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ if (last)
+ clear_page_dirty_for_io(page);
+}
+
+void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->writeback_bitmap |= tmp;
+ set_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->writeback_bitmap &= ~tmp;
+ if (subpage->writeback_bitmap == 0)
+ end_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
+ PageDirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
+ PageWriteback);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index b86a4881475d..bfd626e955be 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -20,6 +20,8 @@ struct btrfs_subpage {
spinlock_t lock;
u16 uptodate_bitmap;
u16 error_bitmap;
+ u16 dirty_bitmap;
+ u16 writeback_bitmap;
union {
/*
* Structures only used by metadata
@@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
+DECLARE_BTRFS_SUBPAGE_OPS(dirty);
+DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7a4ad86adee..4a396c1147f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
}
#endif
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 6eb1c50fa98c..436ac7b4b334 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_rescue_options,
supported_rescue_options_show);
+static ssize_t supported_sectorsizes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ /* Only sectorsize == PAGE_SIZE is now supported */
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_sectorsizes,
+ supported_sectorsizes_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
+ BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
NULL
};
@@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ ssize_t ret;
+
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+
+ return ret;
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh <= 50 || thresh > 100)
+ return -EINVAL;
+
+ fs_info->bg_reclaim_threshold = thresh;
+
+ return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+ btrfs_bg_reclaim_threshold_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, exclusive_operation),
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
+ BTRFS_ATTR_PTR(, bg_reclaim_threshold),
NULL,
};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acff6bb49a97..f75de9f6c0ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->chunk_bytes_reserved)
return;
@@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
+ atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
+ cond_wake_up(&cur_trans->chunk_reserve_wait);
trans->chunk_bytes_reserved = 0;
}
@@ -383,6 +386,8 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
+ atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
+ init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
@@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* lock. smp_wmb() makes sure that all the writes above are
* done before we pop in the zero below
*/
- btrfs_init_reloc_root(trans, root);
+ ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
- return 0;
+ return ret;
}
@@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
@@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&fs_info->reloc_mutex);
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
- return 0;
+ return ret;
}
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
@@ -741,7 +748,16 @@ got_it:
* Thus it need to be called after current->journal_info initialized,
* or we can deadlock.
*/
- btrfs_record_root_in_trans(h, root);
+ ret = btrfs_record_root_in_trans(h, root);
+ if (ret) {
+ /*
+ * The transaction handle is fully initialized and linked with
+ * other structures so it needs to be ended in case of errors,
+ * not just freed.
+ */
+ btrfs_end_transaction(h);
+ return ERR_PTR(ret);
+ }
return h;
@@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
- btrfs_update_reloc_root(trans, root);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* recorded root will never be updated again, causing an outdated root
* item.
*/
- record_root_in_trans(trans, src, 1);
+ ret = record_root_in_trans(trans, src, 1);
+ if (ret)
+ return ret;
/*
* btrfs_qgroup_inherit relies on a consistent view of the usage for the
@@ -1509,7 +1529,7 @@ out:
* insert_dir_item()
*/
if (!ret)
- record_root_in_trans(trans, parent, 1);
+ ret = record_root_in_trans(trans, parent, 1);
return ret;
}
@@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root, 0);
-
+ ret = record_root_in_trans(trans, parent_root, 0);
+ if (ret)
+ goto fail;
cur_time = current_time(parent_inode);
/*
@@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
*/
BUG_ON(list_empty(&cur_trans->list));
- list_del_init(&cur_trans->list);
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
@@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_lock(&fs_info->trans_lock);
}
+
+ /*
+ * Now that we know no one else is still using the transaction we can
+ * remove the transaction from the list of transactions. This avoids
+ * the transaction kthread from cleaning up the transaction while some
+ * other task is still using it, which could result in a use-after-free
+ * on things like log trees, as it forces the transaction kthread to
+ * wait for this transaction to be cleaned up by us.
+ */
+ list_del_init(&cur_trans->list);
+
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6335716e513f..364cfbb4c5c5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,6 +96,13 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
+
+ /*
+ * The number of bytes currently reserved, by all transaction handles
+ * attached to this transaction, for metadata extents of the chunk tree.
+ */
+ atomic64_t chunk_bytes_reserved;
+ wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
inode->last_sub_trans = inode->root->log_transid;
- inode->last_log_commit = inode->root->last_log_commit;
+ inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f4ade821307d..a8b2e0d2c025 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
key->offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ extent_err(leaf, slot,
+ "invalid extent flag, data has full backref set");
+ return -EUCLEAN;
+ }
}
ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 92a368627791..f67721d82e5d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3165,20 +3165,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
- btrfs_init_log_ctx(&root_log_ctx, NULL);
-
- mutex_lock(&log_root_tree->log_mutex);
-
if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
- mutex_unlock(&log_root_tree->log_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
goto out;
}
}
+ mutex_unlock(&fs_info->tree_root->log_mutex);
}
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
+
+ mutex_lock(&log_root_tree->log_mutex);
+
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
@@ -4136,7 +4138,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct extent_map *em1, *em2;
@@ -6278,8 +6281,13 @@ again:
}
wc.replay_dest->log_root = log;
- btrfs_record_root_in_trans(trans, wc.replay_dest);
- ret = walk_log_tree(trans, log, &wc);
+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+ if (ret)
+ /* The loop needs to continue due to the root refs */
+ btrfs_handle_fs_error(fs_info, ret,
+ "failed to record the log root in transaction");
+ else
+ ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
new file mode 100644
index 000000000000..8a3a14686d3e
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "tree-mod-log.h"
+#include "disk-io.h"
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 logical;
+ u64 seq;
+ enum btrfs_mod_log_op op;
+
+ /*
+ * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
+ * operations.
+ */
+ int slot;
+
+ /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
+ u64 generation;
+
+ /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+};
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct tree_mod_elem *tm;
+ u64 min_seq = BTRFS_SEQ_LAST;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ } else {
+ struct btrfs_seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we cannot
+ * remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
+ }
+ min_seq = first->seq;
+ }
+
+ /*
+ * Anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = rb_entry(node, struct tree_mod_elem, node);
+ if (tm->seq >= min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Key order of the log:
+ * node/leaf start address -> sequence
+ *
+ * The 'start address' is the logical address of the *new* root node for root
+ * replace operations, or the logical address of the affected block for all
+ * other operations.
+ */
+static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = rb_entry(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->logical < tm->logical)
+ new = &((*new)->rb_left);
+ else if (cur->logical > tm->logical)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns true if it can. Otherwise, it
+ * returns false with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * write unlock fs_info::tree_mod_log_lock.
+ */
+static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return true;
+ if (eb && btrfs_header_level(eb) == 0)
+ return true;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return true;
+ }
+
+ return false;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return false;
+ if (eb && btrfs_header_level(eb) == 0)
+ return false;
+
+ return true;
+}
+
+static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+ int slot,
+ enum btrfs_mod_log_op op,
+ gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->logical = eb->start;
+ if (op != BTRFS_MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = eb->start;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+ locked = true;
+
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ if (ret)
+ goto free_tms;
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal)
+{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = new_root->start;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = tree_mod_log_insert(fs_info, tm);
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq,
+ bool smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = rb_entry(node, struct tree_mod_elem, node);
+ if (cur->logical < start) {
+ node = node->rb_left;
+ } else if (cur->logical > start) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* We want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* We want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * This returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). Any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, true);
+}
+
+/*
+ * This returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). Any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, false);
+}
+
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items)
+{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = true;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+
+ ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+/*
+ * Returns the logical address of the oldest predecessor of the given root.
+ * Entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
+ u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ bool looped = false;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * The very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). This has the logical address
+ * of the *new* root, making it the very first operation that's logged
+ * for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * If there are no tree operation for the oldest root, we simply
+ * return it. This should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * If there's an operation that's not a root replacement, we
+ * found the oldest version of our root. Normally, we'll find a
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = true;
+ }
+
+ /* If there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. Then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ */
+static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ read_lock(&fs_info->tree_mod_log_lock);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * All the operations are recorded with the operator used for
+ * the modification. As we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ fallthrough;
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case BTRFS_MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case BTRFS_MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case BTRFS_MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case BTRFS_MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case BTRFS_MOD_LOG_ROOT_REPLACE:
+ /*
+ * This operation is special. For roots, this must be
+ * handled explicitly before rewinding.
+ * For non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. In the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. We simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = rb_entry(next, struct tree_mod_elem, node);
+ if (tm->logical != first_tm->logical)
+ break;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+ btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
+ btrfs_tree_read_lock(eb_rewin);
+ tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb_rewin;
+}
+
+/*
+ * Rewind the state of @root's root node to the given @time_seq value.
+ * If there are no changes, the current root->root_node is returned. If anything
+ * changed in between, there's a fresh buffer allocated on which the rewind
+ * operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ int level;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ level = old_root->level;
+ } else {
+ logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
+ }
+
+ tm = tree_mod_log_search(fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
+ btrfs_warn(fs_info,
+ "failed to read tree block %llu from get_old_root",
+ logical);
+ } else {
+ struct tree_mod_elem *tm2;
+
+ btrfs_tree_read_lock(old);
+ eb = btrfs_clone_extent_buffer(old);
+ /*
+ * After the lookup for the most recent tree mod operation
+ * above and before we locked and cloned the extent buffer
+ * 'old', a new tree mod log operation may have been added.
+ * So lookup for a more recent one to make sure the number
+ * of mod log operations we replay is consistent with the
+ * number of items we have in the cloned extent buffer,
+ * otherwise we can hit a BUG_ON when rewinding the extent
+ * buffer.
+ */
+ tm2 = tree_mod_log_search(fs_info, logical, time_seq);
+ btrfs_tree_read_unlock(old);
+ free_extent_buffer(old);
+ ASSERT(tm2);
+ ASSERT(tm2 == tm || tm2->seq > tm->seq);
+ if (!tm2 || tm2->seq < tm->seq) {
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ tm = tm2;
+ }
+ } else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(fs_info, logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, eb_root_owner);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
+ if (tm)
+ tree_mod_log_rewind(fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
+ level = tm->old_root.level;
+ else
+ level = btrfs_header_level(eb_root);
+
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
+/*
+ * Return the lowest sequence number in the tree modification log.
+ *
+ * Return the sequence number of the oldest tree modification log user, which
+ * corresponds to the lowest sequence number of all existing users. If there are
+ * no users it returns 0.
+ */
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
+{
+ u64 ret = 0;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct btrfs_seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ ret = elem->seq;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return ret;
+}
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
new file mode 100644
index 000000000000..12605d19621b
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef BTRFS_TREE_MOD_LOG_H
+#define BTRFS_TREE_MOD_LOG_H
+
+#include "ctree.h"
+
+/* Represents a tree mod log user. */
+struct btrfs_seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define BTRFS_SEQ_LAST ((u64)-1)
+
+enum btrfs_mod_log_op {
+ BTRFS_MOD_LOG_KEY_REPLACE,
+ BTRFS_MOD_LOG_KEY_ADD,
+ BTRFS_MOD_LOG_KEY_REMOVE,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ BTRFS_MOD_LOG_MOVE_KEYS,
+ BTRFS_MOD_LOG_ROOT_REPLACE,
+};
+
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal);
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags);
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq);
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items);
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items);
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3b33efddc5..9a1ead0c4a31 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return 0;
}
-static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int devid_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_device *dev1, *dev2;
@@ -1458,8 +1459,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
/* Given hole range was invalid (outside of device) */
if (ret == -ERANGE) {
*hole_start += *hole_size;
- *hole_size = 0;
- return 1;
+ *hole_size = false;
+ return true;
}
*hole_start += zone_size;
@@ -3098,11 +3099,12 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
+ u64 length;
int ret;
/*
@@ -3117,7 +3119,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
+ lockdep_assert_held(&fs_info->reclaim_bgs_lock);
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
@@ -3130,8 +3132,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ length = block_group->length;
btrfs_put_block_group(block_group);
+ /*
+ * On a zoned file system, discard the whole block group, this will
+ * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
+ * resetting the zone fails, don't treat it as a fatal problem from the
+ * filesystem's point of view.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
+ if (ret)
+ btrfs_info(fs_info,
+ "failed to reset zone %llu after relocation",
+ chunk_offset);
+ }
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3172,10 +3189,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@@ -3183,7 +3200,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto error;
if (ret > 0)
@@ -3204,7 +3221,7 @@ again:
else
BUG_ON(ret);
}
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (found_key.offset == 0)
break;
@@ -3744,10 +3761,10 @@ again:
goto error;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
@@ -3761,7 +3778,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
ret = 0;
break;
}
@@ -3771,7 +3788,7 @@ again:
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
break;
}
@@ -3788,12 +3805,12 @@ again:
btrfs_release_path(path);
if (!ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (counting) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3818,7 +3835,7 @@ again:
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
@@ -3832,7 +3849,7 @@ again:
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
@@ -3840,7 +3857,7 @@ again:
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
@@ -4725,16 +4742,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto done;
ret = 0;
@@ -4747,7 +4764,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4756,7 +4773,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4772,12 +4789,12 @@ again:
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
@@ -4989,6 +5006,8 @@ static void init_alloc_chunk_ctl_policy_zoned(
ctl->max_chunk_size = 2 * ctl->max_stripe_size;
ctl->devs_max = min_t(int, ctl->devs_max,
BTRFS_MAX_DEVS_SYS_CHUNK);
+ } else {
+ BUG();
}
/* We don't want a chunk larger than 10% of writable space */
@@ -6787,6 +6806,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
return div_u64(chunk_len, data_stripes);
}
+#if BITS_PER_LONG == 32
+/*
+ * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
+ * can't be accessed on 32bit systems.
+ *
+ * This function do mount time check to reject the fs if it already has
+ * metadata chunk beyond that limit.
+ */
+static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return 0;
+
+ if (logical + length < MAX_LFS_FILESIZE)
+ return 0;
+
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+}
+
+/*
+ * This is to give early warning for any metadata chunk reaching
+ * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
+ * Although we can still access the metadata, it's not going to be possible
+ * once the limit is reached.
+ */
+static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return;
+
+ if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ return;
+
+ btrfs_warn_32bit_limit(fs_info);
+}
+#endif
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
@@ -6797,6 +6856,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 logical;
u64 length;
u64 devid;
+ u64 type;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
@@ -6804,8 +6864,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+#if BITS_PER_LONG == 32
+ ret = check_32bit_meta_chunk(fs_info, logical, length, type);
+ if (ret < 0)
+ return ret;
+ warn_32bit_meta_chunk(fs_info, logical, length, type);
+#endif
+
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
@@ -6849,10 +6917,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- map->type = btrfs_chunk_type(leaf, chunk);
+ map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(map->type, em->len,
+ em->orig_block_len = calc_stripe_length(type, em->len,
map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
@@ -7448,6 +7516,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
int item_size;
int i, ret, slot;
+ if (!device->fs_info->dev_root)
+ return 0;
+
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
@@ -7998,7 +8069,7 @@ static int relocating_repair_kthread(void *data)
return -EBUSY;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
/* Ensure block group still exists */
cache = btrfs_lookup_block_group(fs_info, target);
@@ -8020,7 +8091,7 @@ static int relocating_repair_kthread(void *data)
out:
if (cache)
btrfs_put_block_group(cache);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d4c3e0dd32b8..9c0d84e5ec06 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1f972b75a9ab..70b23a0d03b1 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -21,9 +21,30 @@
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)
+/*
+ * Location of the first zone of superblock logging zone pairs.
+ *
+ * - primary superblock: 0B (zone 0)
+ * - first copy: 512G (zone starting at that offset)
+ * - second copy: 4T (zone starting at that offset)
+ */
+#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
+#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
+#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
+
+#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/*
+ * Maximum supported zone size. Currently, SMR disks have a zone size of
+ * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
+ * expect the zone size to become larger than 8GiB in the near future.
+ */
+#define BTRFS_MAX_ZONE_SIZE SZ_8G
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
}
/*
- * The following zones are reserved as the circular buffer on ZONED btrfs.
- * - The primary superblock: zones 0 and 1
- * - The first copy: zones 16 and 17
- * - The second copy: zones 1024 or zone at 256GB which is minimum, and
- * the following one
+ * Get the first zone number of the superblock mirror
*/
static inline u32 sb_zone_number(int shift, int mirror)
{
- ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ u64 zone;
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
switch (mirror) {
- case 0: return 0;
- case 1: return 16;
- case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+ case 0: zone = 0; break;
+ case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
+ case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
}
- return 0;
+ ASSERT(zone <= U32_MAX);
+
+ return (u32)zone;
}
/*
@@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
zone_sectors = bdev_zone_sectors(bdev);
}
- nr_sectors = bdev_nr_sectors(bdev);
/* Check if it's power of 2 (see is_power_of_2) */
ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+
+ /* We reject devices with a zone size larger than 8GB */
+ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu larger than supported maximum %llu",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
zone_info->max_zone_append_size =
(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
@@ -311,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
+ btrfs_err(fs_info, "zoned: device %pg does not support zone append",
+ bdev);
+ ret = -EINVAL;
+ goto out;
+ }
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 61e969652fe1..5e41a74a9cb2 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -9,6 +9,12 @@
#include "disk-io.h"
#include "block-group.h"
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+
struct btrfs_zoned_device_info {
/*
* Number of zones, zone size and types of zones if bdev is a
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 891dedda5905..2227dc2d5498 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -7,6 +7,7 @@ cachefiles-y := \
bind.o \
daemon.o \
interface.o \
+ io.o \
key.o \
main.o \
namei.o \
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index dfb14dbddf51..38bb7764b454 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
cache->mnt = path.mnt;
root = path.dentry;
+ ret = -EINVAL;
+ if (mnt_user_ns(path.mnt) != &init_user_ns) {
+ pr_warn("File cache on idmapped mounts not supported");
+ goto error_unsupported;
+ }
+
/* check parameters */
ret = -EOPNOTSUPP;
if (d_is_negative(root) ||
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 5efa6a3702c0..da3948fdb615 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/*
* dispose of a reference to an object
*/
-static void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object;
struct fscache_cache *cache;
@@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.uncache_page = cachefiles_uncache_page,
.dissociate_pages = cachefiles_dissociate_pages,
.check_consistency = cachefiles_check_consistency,
+ .begin_read_operation = cachefiles_begin_read_operation,
};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cf9bd6401c2d..4ed83aa5253b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
*/
extern const struct fscache_cache_ops cachefiles_cache_ops;
+void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why);
+
/*
* key.c
*/
@@ -218,6 +221,12 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *);
extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
/*
+ * rdwr2.c
+ */
+extern int cachefiles_begin_read_operation(struct netfs_read_request *,
+ struct fscache_retrieval *);
+
+/*
* security.c
*/
extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
new file mode 100644
index 000000000000..b13fb45fc3f3
--- /dev/null
+++ b/fs/cachefiles/io.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* kiocb-using read/write
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+struct cachefiles_kiocb {
+ struct kiocb iocb;
+ refcount_t ki_refcnt;
+ loff_t start;
+ union {
+ size_t skipped;
+ size_t len;
+ };
+ netfs_io_terminated_t term_func;
+ void *term_func_priv;
+ bool was_async;
+};
+
+static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
+{
+ if (refcount_dec_and_test(&ki->ki_refcnt)) {
+ fput(ki->iocb.ki_filp);
+ kfree(ki);
+ }
+}
+
+/*
+ * Handle completion of a read from the cache.
+ */
+static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+
+ _enter("%ld,%ld", ret, ret2);
+
+ if (ki->term_func) {
+ if (ret >= 0)
+ ret += ki->skipped;
+ ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ }
+
+ cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a read from the cache.
+ */
+static int cachefiles_read(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ bool seek_data,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ struct cachefiles_kiocb *ki;
+ struct file *file = cres->cache_priv2;
+ unsigned int old_nofs;
+ ssize_t ret = -ENOBUFS;
+ size_t len = iov_iter_count(iter), skipped = 0;
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start_pos, len,
+ i_size_read(file->f_inode));
+
+ /* If the caller asked us to seek for data before doing the read, then
+ * we should do that now. If we find a gap, we fill it with zeros.
+ */
+ if (seek_data) {
+ loff_t off = start_pos, off2;
+
+ off2 = vfs_llseek(file, off, SEEK_DATA);
+ if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
+ skipped = 0;
+ ret = off2;
+ goto presubmission_error;
+ }
+
+ if (off2 == -ENXIO || off2 >= start_pos + len) {
+ /* The region is beyond the EOF or there's no more data
+ * in the region, so clear the rest of the buffer and
+ * return success.
+ */
+ iov_iter_zero(len, iter);
+ skipped = len;
+ ret = 0;
+ goto presubmission_error;
+ }
+
+ skipped = off2 - off;
+ iov_iter_zero(skipped, iter);
+ }
+
+ ret = -ENOBUFS;
+ ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+ if (!ki)
+ goto presubmission_error;
+
+ refcount_set(&ki->ki_refcnt, 2);
+ ki->iocb.ki_filp = file;
+ ki->iocb.ki_pos = start_pos + skipped;
+ ki->iocb.ki_flags = IOCB_DIRECT;
+ ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
+ ki->iocb.ki_ioprio = get_current_ioprio();
+ ki->skipped = skipped;
+ ki->term_func = term_func;
+ ki->term_func_priv = term_func_priv;
+ ki->was_async = true;
+
+ if (ki->term_func)
+ ki->iocb.ki_complete = cachefiles_read_complete;
+
+ get_file(ki->iocb.ki_filp);
+
+ old_nofs = memalloc_nofs_save();
+ ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
+ memalloc_nofs_restore(old_nofs);
+ switch (ret) {
+ case -EIOCBQUEUED:
+ goto in_progress;
+
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /* There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ ret = -EINTR;
+ fallthrough;
+ default:
+ ki->was_async = false;
+ cachefiles_read_complete(&ki->iocb, ret, 0);
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+in_progress:
+ cachefiles_put_kiocb(ki);
+ _leave(" = %zd", ret);
+ return ret;
+
+presubmission_error:
+ if (term_func)
+ term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+ return ret;
+}
+
+/*
+ * Handle completion of a write to the cache.
+ */
+static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+ struct inode *inode = file_inode(ki->iocb.ki_filp);
+
+ _enter("%ld,%ld", ret, ret2);
+
+ /* Tell lockdep we inherited freeze protection from submission thread */
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ __sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+
+ if (ki->term_func)
+ ki->term_func(ki->term_func_priv, ret, ki->was_async);
+
+ cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a write to the cache.
+ */
+static int cachefiles_write(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ struct cachefiles_kiocb *ki;
+ struct inode *inode;
+ struct file *file = cres->cache_priv2;
+ unsigned int old_nofs;
+ ssize_t ret = -ENOBUFS;
+ size_t len = iov_iter_count(iter);
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start_pos, len,
+ i_size_read(file->f_inode));
+
+ ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+ if (!ki)
+ goto presubmission_error;
+
+ refcount_set(&ki->ki_refcnt, 2);
+ ki->iocb.ki_filp = file;
+ ki->iocb.ki_pos = start_pos;
+ ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
+ ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
+ ki->iocb.ki_ioprio = get_current_ioprio();
+ ki->start = start_pos;
+ ki->len = len;
+ ki->term_func = term_func;
+ ki->term_func_priv = term_func_priv;
+ ki->was_async = true;
+
+ if (ki->term_func)
+ ki->iocb.ki_complete = cachefiles_write_complete;
+
+ /* Open-code file_start_write here to grab freeze protection, which
+ * will be released by another thread in aio_complete_rw(). Fool
+ * lockdep by telling it the lock got released so that it doesn't
+ * complain about the held lock when we return to userspace.
+ */
+ inode = file_inode(file);
+ __sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
+ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+
+ get_file(ki->iocb.ki_filp);
+
+ old_nofs = memalloc_nofs_save();
+ ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
+ memalloc_nofs_restore(old_nofs);
+ switch (ret) {
+ case -EIOCBQUEUED:
+ goto in_progress;
+
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /* There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ ret = -EINTR;
+ fallthrough;
+ default:
+ ki->was_async = false;
+ cachefiles_write_complete(&ki->iocb, ret, 0);
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+in_progress:
+ cachefiles_put_kiocb(ki);
+ _leave(" = %zd", ret);
+ return ret;
+
+presubmission_error:
+ if (term_func)
+ term_func(term_func_priv, -ENOMEM, false);
+ return -ENOMEM;
+}
+
+/*
+ * Prepare a read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq,
+ loff_t i_size)
+{
+ struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv;
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ struct file *file = subreq->rreq->cache_resources.cache_priv2;
+ loff_t off, to;
+
+ _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ if (!file)
+ goto cache_fail_nosec;
+
+ if (subreq->start >= i_size)
+ return NETFS_FILL_WITH_ZEROES;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+
+ off = vfs_llseek(file, subreq->start, SEEK_DATA);
+ if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
+ if (off == (loff_t)-ENXIO)
+ goto download_and_store;
+ goto cache_fail;
+ }
+
+ if (off >= subreq->start + subreq->len)
+ goto download_and_store;
+
+ if (off > subreq->start) {
+ off = round_up(off, cache->bsize);
+ subreq->len = off - subreq->start;
+ goto download_and_store;
+ }
+
+ to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+ if (to < 0 && to >= (loff_t)-MAX_ERRNO)
+ goto cache_fail;
+
+ if (to < subreq->start + subreq->len) {
+ if (subreq->start + subreq->len >= i_size)
+ to = round_up(to, cache->bsize);
+ else
+ to = round_down(to, cache->bsize);
+ subreq->len = to - subreq->start;
+ }
+
+ cachefiles_end_secure(cache, saved_cred);
+ return NETFS_READ_FROM_CACHE;
+
+download_and_store:
+ if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0)
+ __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+cache_fail:
+ cachefiles_end_secure(cache, saved_cred);
+cache_fail_nosec:
+ return NETFS_DOWNLOAD_FROM_SERVER;
+}
+
+/*
+ * Prepare for a write to occur.
+ */
+static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
+ loff_t *_start, size_t *_len, loff_t i_size)
+{
+ loff_t start = *_start;
+ size_t len = *_len, down;
+
+ /* Round to DIO size */
+ down = start - round_down(start, PAGE_SIZE);
+ *_start = start - down;
+ *_len = round_up(down + len, PAGE_SIZE);
+ return 0;
+}
+
+/*
+ * Clean up an operation.
+ */
+static void cachefiles_end_operation(struct netfs_cache_resources *cres)
+{
+ struct fscache_retrieval *op = cres->cache_priv;
+ struct file *file = cres->cache_priv2;
+
+ _enter("");
+
+ if (file)
+ fput(file);
+ if (op) {
+ fscache_op_complete(&op->op, false);
+ fscache_put_retrieval(op);
+ }
+
+ _leave("");
+}
+
+static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
+ .end_operation = cachefiles_end_operation,
+ .read = cachefiles_read,
+ .write = cachefiles_write,
+ .prepare_read = cachefiles_prepare_read,
+ .prepare_write = cachefiles_prepare_write,
+};
+
+/*
+ * Open the cache file when beginning a cache operation.
+ */
+int cachefiles_begin_read_operation(struct netfs_read_request *rreq,
+ struct fscache_retrieval *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ struct path path;
+ struct file *file;
+
+ _enter("");
+
+ object = container_of(op->op.object,
+ struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ path.mnt = cache->mnt;
+ path.dentry = object->backer;
+ file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
+ d_inode(object->backer), cache->cache_cred);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ if (!S_ISREG(file_inode(file)->i_mode))
+ goto error_file;
+ if (unlikely(!file->f_op->read_iter) ||
+ unlikely(!file->f_op->write_iter)) {
+ pr_notice("Cache does not support read_iter and write_iter\n");
+ goto error_file;
+ }
+
+ fscache_get_retrieval(op);
+ rreq->cache_resources.cache_priv = op;
+ rreq->cache_resources.cache_priv2 = file;
+ rreq->cache_resources.ops = &cachefiles_netfs_cache_ops;
+ rreq->cookie_debug_id = object->fscache.debug_id;
+ _leave("");
+ return 0;
+
+error_file:
+ fput(file);
+ return -EIO;
+}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index e027c718ca01..8ffc40e84a59 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
container_of(wait, struct cachefiles_one_read, monitor);
struct cachefiles_object *object;
struct fscache_retrieval *op = monitor->op;
- struct wait_bit_key *key = _key;
+ struct wait_page_key *key = _key;
struct page *page = wait->private;
ASSERT(key);
_enter("{%lu},%u,%d,{%p,%u}",
monitor->netfs_page->index, mode, sync,
- key->flags, key->bit_nr);
+ key->page, key->bit_nr);
- if (key->flags != &page->flags ||
- key->bit_nr != PG_locked)
+ if (key->page != page || key->bit_nr != PG_locked)
return 0;
_debug("--- monitor %p %lx ---", page, page->flags);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 570731c4d019..3c03fa37cac4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3358,7 +3358,13 @@ static void handle_cap_grant(struct inode *inode,
if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
(extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(grant->mode);
+ umode_t mode = le32_to_cpu(grant->mode);
+
+ if (inode_wrong_type(inode, mode))
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ else
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
ci->i_btime = extra_info->btime;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 83d9358854fb..f7a790ed62c4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -677,6 +677,8 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
strcmp(dentry->d_name.name,
fsc->mount_options->snapdir_name) == 0) {
struct inode *inode = ceph_get_snapdir(parent);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n",
dentry, dentry, inode);
BUG_ON(!d_unhashed(dentry));
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e088843a7734..f22156ee7306 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -248,9 +248,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
ihold(inode);
} else {
/* mds does not support lookup snapped inode */
- err = -EOPNOTSUPP;
- inode = NULL;
+ inode = ERR_PTR(-EOPNOTSUPP);
}
+ } else {
+ inode = ERR_PTR(-ESTALE);
}
ceph_mdsc_put_request(req);
@@ -261,8 +262,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (!inode)
- return ERR_PTR(-ESTALE);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 156f849f5385..689e3ffd29d7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -78,9 +78,21 @@ struct inode *ceph_get_snapdir(struct inode *parent)
struct inode *inode = ceph_get_inode(parent->i_sb, vino);
struct ceph_inode_info *ci = ceph_inode(inode);
- BUG_ON(!S_ISDIR(parent->i_mode));
if (IS_ERR(inode))
return inode;
+
+ if (!S_ISDIR(parent->i_mode)) {
+ pr_warn_once("bad snapdir parent type (mode=0%o)\n",
+ parent->i_mode);
+ return ERR_PTR(-ENOTDIR);
+ }
+
+ if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ pr_warn_once("bad snapdir inode type (mode=0%o)\n",
+ inode->i_mode);
+ return ERR_PTR(-ENOTDIR);
+ }
+
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
@@ -757,11 +769,32 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
bool queue_trunc = false;
bool new_version = false;
bool fill_inline = false;
+ umode_t mode = le32_to_cpu(info->mode);
+ dev_t rdev = le32_to_cpu(info->rdev);
dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* Once I_NEW is cleared, we can't change type or dev numbers */
+ if (inode->i_state & I_NEW) {
+ inode->i_mode = mode;
+ } else {
+ if (inode_wrong_type(inode, mode)) {
+ pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
+ ceph_vinop(inode), inode->i_mode, mode);
+ return -ESTALE;
+ }
+
+ if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
+ pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
+ ceph_vinop(inode), MAJOR(inode->i_rdev),
+ MINOR(inode->i_rdev), MAJOR(rdev),
+ MINOR(rdev));
+ return -ESTALE;
+ }
+ }
+
info_caps = le32_to_cpu(info->cap.caps);
/* prealloc new cap struct */
@@ -815,8 +848,6 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
- /* update inode */
- inode->i_rdev = le32_to_cpu(info->rdev);
/* directories have fl_stripe_unit set to zero */
if (le32_to_cpu(info->layout.fl_stripe_unit))
inode->i_blkbits =
@@ -828,7 +859,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
- inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_mode = mode;
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
@@ -926,7 +957,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
case S_IFCHR:
case S_IFSOCK:
inode->i_blkbits = PAGE_SHIFT;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ceph_file_iops;
break;
case S_IFREG:
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index fe03cbdae959..bf52e9326ebe 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -18,6 +18,7 @@ config CIFS
select CRYPTO_AES
select CRYPTO_LIB_DES
select KEYS
+ select DNS_RESOLVER
help
This is the client VFS module for the SMB3 family of NAS protocols,
(including support for the most recent, most secure dialect SMB3.1.1)
@@ -112,7 +113,6 @@ config CIFS_WEAK_PW_HASH
config CIFS_UPCALL
bool "Kerberos/SPNEGO advanced session setup"
depends on CIFS
- select DNS_RESOLVER
help
Enables an upcall mechanism for CIFS which accesses userspace helper
utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
@@ -179,7 +179,6 @@ config CIFS_DEBUG_DUMP_KEYS
config CIFS_DFS_UPCALL
bool "DFS feature support"
depends on CIFS
- select DNS_RESOLVER
help
Distributed File System (DFS) support is used to access shares
transparently in an enterprise name space, even if the share
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 5213b20843b5..3ee3b7de4ded 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -10,13 +10,14 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
- smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o
+ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
+ dns_resolve.o
cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o
cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 88a7958170ee..68e8e5b27841 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -17,15 +17,14 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#include "fs_context.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
#include "dfs_cache.h"
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
#endif
-#ifdef CONFIG_CIFS_SWN_UPCALL
#include "cifs_swn.h"
-#endif
void
cifs_dump_mem(char *label, void *data, int length)
@@ -118,10 +117,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
seq_printf(m, " POSIX Extensions");
if (tcon->ses->server->ops->dump_share_caps)
tcon->ses->server->ops->dump_share_caps(m, tcon);
-#ifdef CONFIG_CIFS_SWN_UPCALL
if (tcon->use_witness)
seq_puts(m, " Witness");
-#endif
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
@@ -490,10 +487,8 @@ skip_rdma:
spin_unlock(&cifs_tcp_ses_lock);
seq_putc(m, '\n');
-
-#ifdef CONFIG_CIFS_SWN_UPCALL
cifs_swn_dump(m);
-#endif
+
/* BB add code to dump additional info such as TCP session info now */
return 0;
}
@@ -702,6 +697,7 @@ static const struct proc_ops cifs_lookup_cache_proc_ops;
static const struct proc_ops traceSMB_proc_ops;
static const struct proc_ops cifs_security_flags_proc_ops;
static const struct proc_ops cifs_linux_ext_proc_ops;
+static const struct proc_ops cifs_mount_params_proc_ops;
void
cifs_proc_init(void)
@@ -726,6 +722,8 @@ cifs_proc_init(void)
proc_create("LookupCacheEnabled", 0644, proc_fs_cifs,
&cifs_lookup_cache_proc_ops);
+ proc_create("mount_params", 0444, proc_fs_cifs, &cifs_mount_params_proc_ops);
+
#ifdef CONFIG_CIFS_DFS_UPCALL
proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops);
#endif
@@ -764,6 +762,7 @@ cifs_proc_clean(void)
remove_proc_entry("SecurityFlags", proc_fs_cifs);
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
+ remove_proc_entry("mount_params", proc_fs_cifs);
#ifdef CONFIG_CIFS_DFS_UPCALL
remove_proc_entry("dfscache", proc_fs_cifs);
@@ -1023,6 +1022,51 @@ static const struct proc_ops cifs_security_flags_proc_ops = {
.proc_release = single_release,
.proc_write = cifs_security_flags_proc_write,
};
+
+/* To make it easier to debug, can help to show mount params */
+static int cifs_mount_params_proc_show(struct seq_file *m, void *v)
+{
+ const struct fs_parameter_spec *p;
+ const char *type;
+
+ for (p = smb3_fs_parameters; p->name; p++) {
+ /* cannot use switch with pointers... */
+ if (!p->type) {
+ if (p->flags == fs_param_neg_with_no)
+ type = "noflag";
+ else
+ type = "flag";
+ } else if (p->type == fs_param_is_bool)
+ type = "bool";
+ else if (p->type == fs_param_is_u32)
+ type = "u32";
+ else if (p->type == fs_param_is_u64)
+ type = "u64";
+ else if (p->type == fs_param_is_string)
+ type = "string";
+ else
+ type = "unknown";
+
+ seq_printf(m, "%s:%s\n", p->name, type);
+ }
+
+ return 0;
+}
+
+static int cifs_mount_params_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_mount_params_proc_show, NULL);
+}
+
+static const struct proc_ops cifs_mount_params_proc_ops = {
+ .proc_open = cifs_mount_params_proc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ /* No need for write for now */
+ /* .proc_write = cifs_mount_params_proc_write, */
+};
+
#else
inline void cifs_proc_init(void)
{
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6b1ce4efb591..c87c37cf2914 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -270,7 +270,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
char *mountdata;
char *devname;
- devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ devname = kstrdup(fullpath, GFP_KERNEL);
if (!devname)
return ERR_PTR(-ENOMEM);
@@ -302,6 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
struct cifs_sb_info *cifs_sb;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
+ void *page;
char *full_path, *root_path;
unsigned int xid;
int rc;
@@ -324,10 +325,13 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
goto cdda_exit;
}
+ page = alloc_dentry_path();
/* always use tree name prefix */
- full_path = build_path_from_dentry_optional_prefix(mntpt, true);
- if (full_path == NULL)
- goto cdda_exit;
+ full_path = build_path_from_dentry_optional_prefix(mntpt, page, true);
+ if (IS_ERR(full_path)) {
+ mnt = ERR_CAST(full_path);
+ goto free_full_path;
+ }
convert_delimiter(full_path, '\\');
@@ -385,7 +389,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
free_root_path:
kfree(root_path);
free_full_path:
- kfree(full_path);
+ free_dentry_path(page);
cdda_exit:
cifs_dbg(FYI, "leaving %s\n" , __func__);
return mnt;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index aa77edc12212..2a5325a7ae49 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -81,5 +81,9 @@ struct cifs_sb_info {
* (cifs_autodisable_serverino) in order to match new mounts.
*/
bool mnt_cifs_serverino_autodisabled;
+ /*
+ * Available once the mount has completed.
+ */
+ struct dentry *root;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_swn.h b/fs/cifs/cifs_swn.h
index 236ecd4959d5..8a9d2a5c9077 100644
--- a/fs/cifs/cifs_swn.h
+++ b/fs/cifs/cifs_swn.h
@@ -7,11 +7,13 @@
#ifndef _CIFS_SWN_H
#define _CIFS_SWN_H
+#include "cifsglob.h"
struct cifs_tcon;
struct sk_buff;
struct genl_info;
+#ifdef CONFIG_CIFS_SWN_UPCALL
extern int cifs_swn_register(struct cifs_tcon *tcon);
extern int cifs_swn_unregister(struct cifs_tcon *tcon);
@@ -22,4 +24,29 @@ extern void cifs_swn_dump(struct seq_file *m);
extern void cifs_swn_check(void);
+static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server)
+{
+ if (server->use_swn_dstaddr) {
+ server->dstaddr = server->swn_dstaddr;
+ return true;
+ }
+ return false;
+}
+
+static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server)
+{
+ server->use_swn_dstaddr = false;
+}
+
+#else
+
+static inline int cifs_swn_register(struct cifs_tcon *tcon) { return 0; }
+static inline int cifs_swn_unregister(struct cifs_tcon *tcon) { return 0; }
+static inline int cifs_swn_notify(struct sk_buff *s, struct genl_info *i) { return 0; }
+static inline void cifs_swn_dump(struct seq_file *m) {}
+static inline void cifs_swn_check(void) {}
+static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) { return false; }
+static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) {}
+
+#endif /* CONFIG_CIFS_SWN_UPCALL */
#endif /* _CIFS_SWN_H */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 2be22a5c690f..784407f9280f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1094,11 +1094,9 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
struct cifs_ace *pnntace = NULL;
char *nacl_base = NULL;
u32 num_aces = 0;
- __u64 nmode;
bool new_aces_set = false;
/* Assuming that pndacl and pnmode are never NULL */
- nmode = *pnmode;
nacl_base = (char *)pndacl;
nsize = sizeof(struct cifs_acl);
@@ -1130,8 +1128,7 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl,
}
/* If it's any one of the ACE we're replacing, skip! */
- if (!mode_from_sid &&
- ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
+ if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) ||
(compare_sids(&pntace->sid, pownersid) == 0) ||
(compare_sids(&pntace->sid, pgrpsid) == 0) ||
(compare_sids(&pntace->sid, &sid_everyone) == 0) ||
@@ -1652,7 +1649,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
* Add three ACEs for owner, group, everyone getting rid of other ACEs
* as chmod disables ACEs and set the security descriptor. Allocate
* memory for the smb header, set security descriptor request security
- * descriptor parameters, and secuirty descriptor itself
+ * descriptor parameters, and security descriptor itself
*/
nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN);
pnntsd = kmalloc(nsecdesclen, GFP_KERNEL);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 099ad9f3660b..5f2c139143a7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -217,8 +217,11 @@ cifs_read_super(struct super_block *sb)
rc = super_setup_bdi(sb);
if (rc)
goto out_no_root;
- /* tune readahead according to rsize */
- sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE;
+ /* tune readahead according to rsize if readahead size not set on mount */
+ if (cifs_sb->ctx->rasize)
+ sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
+ else
+ sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE;
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
@@ -257,6 +260,29 @@ out_no_root:
static void cifs_kill_sb(struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_tcon *tcon;
+ struct cached_fid *cfid;
+
+ /*
+ * We ned to release all dentries for the cached directories
+ * before we kill the sb.
+ */
+ if (cifs_sb->root) {
+ dput(cifs_sb->root);
+ cifs_sb->root = NULL;
+ }
+ tcon = cifs_sb_master_tcon(cifs_sb);
+ if (tcon) {
+ cfid = &tcon->crfid;
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->dentry) {
+
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ }
+
kill_anon_super(sb);
cifs_umount(cifs_sb);
}
@@ -476,7 +502,8 @@ static int cifs_show_devname(struct seq_file *m, struct dentry *root)
seq_puts(m, "none");
else {
convert_delimiter(devname, '/');
- seq_puts(m, devname);
+ /* escape all spaces in share names */
+ seq_escape(m, devname, " \t");
kfree(devname);
}
return 0;
@@ -625,6 +652,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
seq_printf(s, ",bsize=%u", cifs_sb->ctx->bsize);
+ if (cifs_sb->ctx->rasize)
+ seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
seq_printf(s, ",echo_interval=%lu",
@@ -655,10 +684,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",multichannel,max_channels=%zu",
tcon->ses->chan_max);
-#ifdef CONFIG_CIFS_SWN_UPCALL
if (tcon->use_witness)
seq_puts(s, ",witness");
-#endif
return 0;
}
@@ -833,6 +860,12 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
goto out;
}
+ /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */
+ kfree(cifs_sb->ctx->UNC);
+ cifs_sb->ctx->UNC = NULL;
+ kfree(cifs_sb->ctx->prepath);
+ cifs_sb->ctx->prepath = NULL;
+
rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
if (rc) {
root = ERR_PTR(rc);
@@ -887,6 +920,9 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
if (IS_ERR(root))
goto out_super;
+ if (cifs_sb)
+ cifs_sb->root = dget(root);
+
cifs_dbg(FYI, "dentry root is: %p\n", root);
return root;
@@ -1527,10 +1563,6 @@ init_cifs(void)
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
-#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
- INIT_LIST_HEAD(&GlobalDnotifyReqList);
- INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
-#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
/*
* Initialize Global counters
*/
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0d7ef150dbb2..6beddb108ba0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.31"
+#define CIFS_VERSION "2.32"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 31fc8695abd6..b23a0ee8c6f8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -495,7 +495,7 @@ struct smb_version_operations {
struct inode *inode,
struct dentry *dentry,
struct cifs_tcon *tcon,
- char *full_path,
+ const char *full_path,
umode_t mode,
dev_t device_number);
/* version specific fiemap implementation */
@@ -919,8 +919,8 @@ struct cifs_ses {
bool binding:1; /* are we binding the session? */
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
- __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
- __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+ __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
+ __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
__u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
@@ -988,10 +988,12 @@ struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
bool file_all_info_is_valid:1;
bool has_lease:1;
+ unsigned long time; /* jiffies of when lease was taken */
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
struct cifs_tcon *tcon;
+ struct dentry *dentry;
struct work_struct lease_break;
struct smb2_file_all_info file_all_info;
};
@@ -1070,6 +1072,7 @@ struct cifs_tcon {
bool use_resilient:1; /* use resilient instead of durable handles */
bool use_persistent:1; /* use persistent instead of durable handles */
bool no_lease:1; /* Do not request leases on files or directories */
+ bool use_witness:1; /* use witness protocol */
__le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
@@ -1094,9 +1097,6 @@ struct cifs_tcon {
int remap:2;
struct list_head ulist; /* cache update list */
#endif
-#ifdef CONFIG_CIFS_SWN_UPCALL
- bool use_witness:1; /* use witness protocol */
-#endif
};
/*
@@ -1283,8 +1283,6 @@ struct cifs_aio_ctx {
bool direct_io;
};
-struct cifs_readdata;
-
/* asynchronous read support */
struct cifs_readdata {
struct kref refcount;
@@ -1318,8 +1316,6 @@ struct cifs_readdata {
struct page **pages;
};
-struct cifs_writedata;
-
/* asynchronous write support */
struct cifs_writedata {
struct kref refcount;
@@ -1798,9 +1794,8 @@ require use of the stronger protocol */
*
* Semaphores
* ----------
- * sesSem operations on smb session
- * tconSem operations on tree connection
- * fh_sem file handle reconnection operations
+ * cifsInodeInfo->lock_sem protects:
+ * the list of locks held by the inode
*
****************************************************************************/
@@ -1831,13 +1826,6 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
*/
GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
-#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
-/* Outstanding dir notify requests */
-GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
-/* DirNotify response queue */
-GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
-#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
-
/*
* Global transaction id (XID) information
*/
@@ -1881,19 +1869,9 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */
extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
-GLOBAL_EXTERN struct rb_root uidtree;
-GLOBAL_EXTERN struct rb_root gidtree;
-GLOBAL_EXTERN spinlock_t siduidlock;
-GLOBAL_EXTERN spinlock_t sidgidlock;
-GLOBAL_EXTERN struct rb_root siduidtree;
-GLOBAL_EXTERN struct rb_root sidgidtree;
-GLOBAL_EXTERN spinlock_t uidsidlock;
-GLOBAL_EXTERN spinlock_t gidsidlock;
-
void cifs_oplock_break(struct work_struct *work);
void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
-extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
extern struct workqueue_struct *decrypt_wq;
extern struct workqueue_struct *fileinfo_put_wq;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 64fe5a47b5e8..b53a87db282f 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -147,6 +147,11 @@
*/
#define SMB3_SIGN_KEY_SIZE (16)
+/*
+ * Size of the smb3 encryption/decryption keys
+ */
+#define SMB3_ENC_DEC_KEY_SIZE (32)
+
#define CIFS_CLIENT_CHALLENGE_SIZE (8)
#define CIFS_SERVER_CHALLENGE_SIZE (8)
#define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -1898,7 +1903,7 @@ typedef struct smb_com_transaction2_fnext_req {
__le16 InformationLevel;
__u32 ResumeKey;
__le16 SearchFlags;
- char ResumeFileName[1];
+ char ResumeFileName[];
} __attribute__((packed)) TRANSACTION2_FNEXT_REQ;
typedef struct smb_com_transaction2_fnext_rsp {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 75ce6f742b8d..a79d50001fbf 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,9 +69,20 @@ extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
extern int init_cifs_spnego(void);
extern void exit_cifs_spnego(void);
-extern char *build_path_from_dentry(struct dentry *);
+extern const char *build_path_from_dentry(struct dentry *, void *);
extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry,
- bool prefix);
+ void *page, bool prefix);
+static inline void *alloc_dentry_path(void)
+{
+ return __getname();
+}
+
+static inline void free_dentry_path(void *page)
+{
+ if (page)
+ __putname(page);
+}
+
extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
struct cifs_sb_info *cifs_sb,
struct cifs_tcon *tcon,
@@ -184,7 +195,7 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
struct file *file,
struct tcon_link *tlink,
__u32 oplock);
-extern int cifs_posix_open(char *full_path, struct inode **inode,
+extern int cifs_posix_open(const char *full_path, struct inode **inode,
struct super_block *sb, int mode,
unsigned int f_flags, __u32 *oplock, __u16 *netfid,
unsigned int xid);
@@ -194,7 +205,7 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
struct cifs_sb_info *cifs_sb);
extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
struct cifs_sb_info *);
-extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
@@ -207,7 +218,7 @@ extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs,
- unsigned int xid, char *full_path, __u32 dosattr);
+ unsigned int xid, const char *full_path, __u32 dosattr);
extern int cifs_rename_pending_delete(const char *full_path,
struct dentry *dentry,
const unsigned int xid);
@@ -358,11 +369,6 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid,
struct cifs_tcon *tcon,
bool delete_file, __u16 fid,
__u32 pid_of_opener);
-#if 0
-extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon,
- char *fileName, __u16 dos_attributes,
- const struct nls_table *nls_codepage);
-#endif /* possibly unneeded function */
extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
const char *file_name, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_allocation);
@@ -504,12 +510,6 @@ extern int generate_smb311signingkey(struct cifs_ses *);
extern int calc_lanman_hash(const char *password, const char *cryptkey,
bool encrypt, char *lnm_session_key);
#endif /* CIFS_WEAK_PW_HASH */
-#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
-extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
- const int notify_subdirs, const __u16 netfid,
- __u32 filter, struct file *file, int multishot,
- const struct nls_table *nls_codepage);
-#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
extern int CIFSSMBCopy(unsigned int xid,
struct cifs_tcon *source_tcon,
const char *fromName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c279527aae92..41f74163cc1c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -114,7 +114,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
mutex_lock(&tcon->crfid.fid_mutex);
tcon->crfid.is_valid = false;
/* cached handle is not valid, so SMB2_CLOSE won't be sent below */
- close_shroot_lease_locked(&tcon->crfid);
+ close_cached_dir_lease_locked(&tcon->crfid);
memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid));
mutex_unlock(&tcon->crfid.fid_mutex);
@@ -5917,56 +5917,6 @@ SetTimesRetry:
return rc;
}
-/* Can not be used to set time stamps yet (due to old DOS time format) */
-/* Can be used to set attributes */
-#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug
- handling it anyway and NT4 was what we thought it would be needed for
- Do not delete it until we prove whether needed for Win9x though */
-int
-CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName,
- __u16 dos_attrs, const struct nls_table *nls_codepage)
-{
- SETATTR_REQ *pSMB = NULL;
- SETATTR_RSP *pSMBr = NULL;
- int rc = 0;
- int bytes_returned;
- int name_len;
-
- cifs_dbg(FYI, "In SetAttrLegacy\n");
-
-SetAttrLgcyRetry:
- rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
- (void **) &pSMBr);
- if (rc)
- return rc;
-
- if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len =
- ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage);
- name_len++; /* trailing null */
- name_len *= 2;
- } else {
- name_len = copy_path_name(pSMB->fileName, fileName);
- }
- pSMB->attr = cpu_to_le16(dos_attrs);
- pSMB->BufferFormat = 0x04;
- inc_rfc1001_len(pSMB, name_len + 1);
- pSMB->ByteCount = cpu_to_le16(name_len + 1);
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned, 0);
- if (rc)
- cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc);
-
- cifs_buf_release(pSMB);
-
- if (rc == -EAGAIN)
- goto SetAttrLgcyRetry;
-
- return rc;
-}
-#endif /* temporarily unneeded SetAttr legacy function */
-
static void
cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
const struct cifs_unix_set_info_args *args)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index eec8a2052da2..121d8b4535b0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -62,9 +62,7 @@
#include "dfs_cache.h"
#endif
#include "fs_context.h"
-#ifdef CONFIG_CIFS_SWN_UPCALL
#include "cifs_swn.h"
-#endif
extern mempool_t *cifs_req_poolp;
extern bool disable_legacy_dialects;
@@ -87,7 +85,6 @@ static void cifs_prune_tlinks(struct work_struct *work);
*
* This should be called with server->srv_mutex held.
*/
-#ifdef CONFIG_CIFS_DFS_UPCALL
static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
{
int rc;
@@ -124,6 +121,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
return !rc ? -1 : 0;
}
+#ifdef CONFIG_CIFS_DFS_UPCALL
/* These functions must be called with server->srv_mutex held */
static void reconn_set_next_dfs_target(struct TCP_Server_Info *server,
struct cifs_sb_info *cifs_sb,
@@ -314,25 +312,34 @@ cifs_reconnect(struct TCP_Server_Info *server)
mutex_lock(&server->srv_mutex);
-#ifdef CONFIG_CIFS_SWN_UPCALL
- if (server->use_swn_dstaddr) {
- server->dstaddr = server->swn_dstaddr;
- } else {
-#endif
+ if (!cifs_swn_set_server_dstaddr(server)) {
#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (cifs_sb && cifs_sb->origin_fullpath)
/*
* Set up next DFS target server (if any) for reconnect. If DFS
* feature is disabled, then we will retry last server we
* connected to before.
*/
reconn_set_next_dfs_target(server, cifs_sb, &tgt_list, &tgt_it);
+ else {
#endif
+ /*
+ * Resolve the hostname again to make sure that IP address is up-to-date.
+ */
+ rc = reconn_set_ipaddr_from_hostname(server);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+ __func__, rc);
+ }
-#ifdef CONFIG_CIFS_SWN_UPCALL
+#ifdef CONFIG_CIFS_DFS_UPCALL
}
#endif
+
+ }
+
if (cifs_rdma_enabled(server))
rc = smbd_reconnect(server);
else
@@ -348,9 +355,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
spin_unlock(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_SWN_UPCALL
- server->use_swn_dstaddr = false;
-#endif
+ cifs_swn_reset_server_dstaddr(server);
mutex_unlock(&server->srv_mutex);
}
} while (server->tcpStatus == CifsNeedReconnect);
@@ -415,10 +420,8 @@ cifs_echo_request(struct work_struct *work)
cifs_dbg(FYI, "Unable to send echo request to server: %s\n",
server->hostname);
-#ifdef CONFIG_CIFS_SWN_UPCALL
/* Check witness registrations */
cifs_swn_check();
-#endif
requeue_echo:
queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval);
@@ -1775,9 +1778,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
* for the request.
*/
if (is_domain && ses->domainName) {
- ctx->domainname = kstrndup(ses->domainName,
- strlen(ses->domainName),
- GFP_KERNEL);
+ ctx->domainname = kstrdup(ses->domainName, GFP_KERNEL);
if (!ctx->domainname) {
cifs_dbg(FYI, "Unable to allocate %zd bytes for domain\n",
len);
@@ -1994,7 +1995,6 @@ cifs_put_tcon(struct cifs_tcon *tcon)
return;
}
-#ifdef CONFIG_CIFS_SWN_UPCALL
if (tcon->use_witness) {
int rc;
@@ -2004,7 +2004,6 @@ cifs_put_tcon(struct cifs_tcon *tcon)
__func__, rc);
}
}
-#endif
list_del_init(&tcon->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2166,9 +2165,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
tcon->use_resilient = true;
}
-#ifdef CONFIG_CIFS_SWN_UPCALL
+
tcon->use_witness = false;
- if (ctx->witness) {
+ if (IS_ENABLED(CONFIG_CIFS_SWN_UPCALL) && ctx->witness) {
if (ses->server->vals->protocol_id >= SMB30_PROT_ID) {
if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) {
/*
@@ -2194,7 +2193,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out_fail;
}
}
-#endif
/* If the user really knows what they are doing they can override */
if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) {
@@ -3411,8 +3409,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
}
/* Save mount options */
- mntdata = kstrndup(cifs_sb->ctx->mount_options,
- strlen(cifs_sb->ctx->mount_options), GFP_KERNEL);
+ mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL);
if (!mntdata) {
rc = -ENOMEM;
goto error;
@@ -3485,7 +3482,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
* links, the prefix path is included in both and may be changed during reconnect. See
* cifs_tree_connect().
*/
- cifs_sb->origin_fullpath = kstrndup(full_path, strlen(full_path), GFP_KERNEL);
+ cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL);
if (!cifs_sb->origin_fullpath) {
rc = -ENOMEM;
goto error;
@@ -3862,9 +3859,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ctx->sectype = master_tcon->ses->sectype;
ctx->sign = master_tcon->ses->sign;
ctx->seal = master_tcon->seal;
-#ifdef CONFIG_CIFS_SWN_UPCALL
ctx->witness = master_tcon->use_witness;
-#endif
rc = cifs_set_vol_auth(ctx, master_tcon->ses);
if (rc) {
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 098b4bc8da59..b1fa30fefe1f 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -81,23 +81,24 @@ static void refresh_cache_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static int get_normalized_path(const char *path, char **npath)
+static int get_normalized_path(const char *path, const char **npath)
{
if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/'))
return -EINVAL;
if (*path == '\\') {
- *npath = (char *)path;
+ *npath = path;
} else {
- *npath = kstrndup(path, strlen(path), GFP_KERNEL);
- if (!*npath)
+ char *s = kstrdup(path, GFP_KERNEL);
+ if (!s)
return -ENOMEM;
- convert_delimiter(*npath, '\\');
+ convert_delimiter(s, '\\');
+ *npath = s;
}
return 0;
}
-static inline void free_normalized_path(const char *path, char *npath)
+static inline void free_normalized_path(const char *path, const char *npath)
{
if (path != npath)
kfree(npath);
@@ -358,7 +359,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed)
t = kmalloc(sizeof(*t), GFP_ATOMIC);
if (!t)
return ERR_PTR(-ENOMEM);
- t->name = kstrndup(name, strlen(name), GFP_ATOMIC);
+ t->name = kstrdup(name, GFP_ATOMIC);
if (!t->name) {
kfree(t);
return ERR_PTR(-ENOMEM);
@@ -419,7 +420,7 @@ static struct cache_entry *alloc_cache_entry(const char *path,
if (!ce)
return ERR_PTR(-ENOMEM);
- ce->path = kstrndup(path, strlen(path), GFP_KERNEL);
+ ce->path = kstrdup(path, GFP_KERNEL);
if (!ce->path) {
kmem_cache_free(cache_slab, ce);
return ERR_PTR(-ENOMEM);
@@ -531,7 +532,7 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha
char *s, *e;
char sep;
- npath = kstrndup(path, strlen(path), GFP_KERNEL);
+ npath = kstrdup(path, GFP_KERNEL);
if (!npath)
return ERR_PTR(-ENOMEM);
@@ -641,7 +642,7 @@ static int __update_cache_entry(const char *path,
if (ce->tgthint) {
s = ce->tgthint->name;
- th = kstrndup(s, strlen(s), GFP_ATOMIC);
+ th = kstrdup(s, GFP_ATOMIC);
if (!th)
return -ENOMEM;
}
@@ -786,11 +787,11 @@ static int setup_referral(const char *path, struct cache_entry *ce,
memset(ref, 0, sizeof(*ref));
- ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC);
+ ref->path_name = kstrdup(path, GFP_ATOMIC);
if (!ref->path_name)
return -ENOMEM;
- ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC);
+ ref->node_name = kstrdup(target, GFP_ATOMIC);
if (!ref->node_name) {
rc = -ENOMEM;
goto err_free_path;
@@ -828,7 +829,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl)
goto err_free_it;
}
- it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC);
+ it->it_name = kstrdup(t->name, GFP_ATOMIC);
if (!it->it_name) {
kfree(it);
rc = -ENOMEM;
@@ -882,7 +883,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
struct dfs_cache_tgt_list *tgt_list)
{
int rc;
- char *npath;
+ const char *npath;
struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
@@ -936,7 +937,7 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
struct dfs_cache_tgt_list *tgt_list)
{
int rc;
- char *npath;
+ const char *npath;
struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
@@ -991,7 +992,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
const struct dfs_cache_tgt_iterator *it)
{
int rc;
- char *npath;
+ const char *npath;
struct cache_entry *ce;
struct cache_dfs_tgt *t;
@@ -1053,7 +1054,7 @@ int dfs_cache_noreq_update_tgthint(const char *path,
const struct dfs_cache_tgt_iterator *it)
{
int rc;
- char *npath;
+ const char *npath;
struct cache_entry *ce;
struct cache_dfs_tgt *t;
@@ -1111,7 +1112,7 @@ int dfs_cache_get_tgt_referral(const char *path,
struct dfs_info3_param *ref)
{
int rc;
- char *npath;
+ const char *npath;
struct cache_entry *ce;
if (!it || !ref)
@@ -1166,7 +1167,7 @@ int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fu
if (!vi)
return -ENOMEM;
- vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ vi->fullpath = kstrdup(fullpath, GFP_KERNEL);
if (!vi->fullpath) {
rc = -ENOMEM;
goto err_free_vi;
@@ -1484,7 +1485,7 @@ static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon)
{
int rc = 0;
unsigned int xid;
- char *path, *npath;
+ const char *path, *npath;
struct cache_entry *ce;
struct cifs_ses *root_ses = NULL, *ses;
struct dfs_info3_param *refs = NULL;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a3fb81e0ba17..c85aff838305 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -78,31 +78,31 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s
}
/* Note: caller must free return buffer */
-char *
-build_path_from_dentry(struct dentry *direntry)
+const char *
+build_path_from_dentry(struct dentry *direntry, void *page)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
bool prefix = tcon->Flags & SMB_SHARE_IS_IN_DFS;
- return build_path_from_dentry_optional_prefix(direntry,
+ return build_path_from_dentry_optional_prefix(direntry, page,
prefix);
}
char *
-build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix)
+build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ bool prefix)
{
- struct dentry *temp;
- int namelen;
int dfsplen;
int pplen = 0;
- char *full_path;
- char dirsep;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- unsigned seq;
+ char dirsep = CIFS_DIR_SEP(cifs_sb);
+ char *s;
+
+ if (unlikely(!page))
+ return ERR_PTR(-ENOMEM);
- dirsep = CIFS_DIR_SEP(cifs_sb);
if (prefix)
dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
else
@@ -111,86 +111,39 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0;
-cifs_bp_rename_retry:
- namelen = dfsplen + pplen;
- seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- for (temp = direntry; !IS_ROOT(temp);) {
- namelen += (1 + temp->d_name.len);
- temp = temp->d_parent;
- if (temp == NULL) {
- cifs_dbg(VFS, "corrupt dentry\n");
- rcu_read_unlock();
- return NULL;
- }
- }
- rcu_read_unlock();
-
- full_path = kmalloc(namelen+1, GFP_ATOMIC);
- if (full_path == NULL)
- return full_path;
- full_path[namelen] = 0; /* trailing null */
- rcu_read_lock();
- for (temp = direntry; !IS_ROOT(temp);) {
- spin_lock(&temp->d_lock);
- namelen -= 1 + temp->d_name.len;
- if (namelen < 0) {
- spin_unlock(&temp->d_lock);
- break;
- } else {
- full_path[namelen] = dirsep;
- strncpy(full_path + namelen + 1, temp->d_name.name,
- temp->d_name.len);
- cifs_dbg(FYI, "name: %s\n", full_path + namelen);
- }
- spin_unlock(&temp->d_lock);
- temp = temp->d_parent;
- if (temp == NULL) {
- cifs_dbg(VFS, "corrupt dentry\n");
- rcu_read_unlock();
- kfree(full_path);
- return NULL;
- }
- }
- rcu_read_unlock();
- if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) {
- cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
- namelen, dfsplen);
- /* presumably this is only possible if racing with a rename
- of one of the parent directories (we can not lock the dentries
- above us to prevent this, but retrying should be harmless) */
- kfree(full_path);
- goto cifs_bp_rename_retry;
- }
- /* DIR_SEP already set for byte 0 / vs \ but not for
- subsequent slashes in prepath which currently must
- be entered the right way - not sure if there is an alternative
- since the '\' is a valid posix character so we can not switch
- those safely to '/' if any are found in the middle of the prepath */
- /* BB test paths to Windows with '/' in the midst of prepath */
-
+ s = dentry_path_raw(direntry, page, PAGE_SIZE);
+ if (IS_ERR(s))
+ return s;
+ if (!s[1]) // for root we want "", not "/"
+ s++;
+ if (s < (char *)page + pplen + dfsplen)
+ return ERR_PTR(-ENAMETOOLONG);
if (pplen) {
- int i;
-
cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath);
- memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1);
- full_path[dfsplen] = dirsep;
- for (i = 0; i < pplen-1; i++)
- if (full_path[dfsplen+1+i] == '/')
- full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb);
+ s -= pplen;
+ memcpy(s + 1, cifs_sb->prepath, pplen - 1);
+ *s = '/';
}
+ if (dirsep != '/') {
+ /* BB test paths to Windows with '/' in the midst of prepath */
+ char *p;
+ for (p = s; *p; p++)
+ if (*p == '/')
+ *p = dirsep;
+ }
if (dfsplen) {
- strncpy(full_path, tcon->treeName, dfsplen);
+ s -= dfsplen;
+ memcpy(s, tcon->treeName, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
- if (full_path[i] == '\\')
- full_path[i] = '/';
+ if (s[i] == '\\')
+ s[i] = '/';
}
}
}
- return full_path;
+ return s;
}
/*
@@ -233,7 +186,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
int desired_access;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = tlink_tcon(tlink);
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
FILE_ALL_INFO *buf = NULL;
struct inode *newinode = NULL;
int disposition;
@@ -244,9 +198,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
if (tcon->ses->server->oplocks)
*oplock = REQ_OPLOCK;
- full_path = build_path_from_dentry(direntry);
- if (!full_path)
- return -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ free_dentry_path(page);
+ return PTR_ERR(full_path);
+ }
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -418,15 +374,16 @@ cifs_create_get_file_info:
if (newinode) {
if (server->ops->set_lease_key)
server->ops->set_lease_key(newinode, fid);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
- newinode->i_mode = mode;
- if ((*oplock & CIFS_CREATE_ACTION) &&
- (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
- newinode->i_uid = current_fsuid();
- if (inode->i_mode & S_ISGID)
- newinode->i_gid = inode->i_gid;
- else
- newinode->i_gid = current_fsgid();
+ if ((*oplock & CIFS_CREATE_ACTION) && S_ISREG(newinode->i_mode)) {
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+ newinode->i_mode = mode;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+ newinode->i_uid = current_fsuid();
+ if (inode->i_mode & S_ISGID)
+ newinode->i_gid = inode->i_gid;
+ else
+ newinode->i_gid = current_fsgid();
+ }
}
}
}
@@ -448,7 +405,7 @@ cifs_create_set_dentry:
out:
kfree(buf);
- kfree(full_path);
+ free_dentry_path(page);
return rc;
out_err:
@@ -619,7 +576,8 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
- char *full_path = NULL;
+ const char *full_path;
+ void *page;
if (!old_valid_dev(device_number))
return -EINVAL;
@@ -629,13 +587,13 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
+ page = alloc_dentry_path();
tcon = tlink_tcon(tlink);
-
xid = get_xid();
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto mknod_out;
}
@@ -644,7 +602,7 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
device_number);
mknod_out:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -660,7 +618,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
struct inode *newInode = NULL;
- char *full_path = NULL;
+ const char *full_path;
+ void *page;
xid = get_xid();
@@ -687,11 +646,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
/* can not grab the rename sem here since it would
deadlock in the cases (beginning of sys_rename itself)
in which we already have the sb rename sem */
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
cifs_put_tlink(tlink);
free_xid(xid);
- return ERR_PTR(-ENOMEM);
+ free_dentry_path(page);
+ return ERR_CAST(full_path);
}
if (d_really_is_positive(direntry)) {
@@ -727,7 +688,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
}
newInode = ERR_PTR(rc);
}
- kfree(full_path);
+ free_dentry_path(page);
cifs_put_tlink(tlink);
free_xid(xid);
return d_splice_alias(newInode, direntry);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 26de4329d161..639c59596d4f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -112,7 +112,7 @@ static inline int cifs_get_disposition(unsigned int flags)
return FILE_OPEN;
}
-int cifs_posix_open(char *full_path, struct inode **pinode,
+int cifs_posix_open(const char *full_path, struct inode **pinode,
struct super_block *sb, int mode, unsigned int f_flags,
__u32 *poplock, __u16 *pnetfid, unsigned int xid)
{
@@ -165,7 +165,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
goto posix_open_ret;
}
} else {
- cifs_fattr_to_inode(*pinode, &fattr);
+ cifs_revalidate_mapping(*pinode);
+ rc = cifs_fattr_to_inode(*pinode, &fattr);
}
posix_open_ret:
@@ -174,7 +175,7 @@ posix_open_ret:
}
static int
-cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
+cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock,
struct cifs_fid *fid, unsigned int xid)
{
@@ -529,7 +530,8 @@ int cifs_open(struct inode *inode, struct file *file)
struct cifs_tcon *tcon;
struct tcon_link *tlink;
struct cifsFileInfo *cfile = NULL;
- char *full_path = NULL;
+ void *page;
+ const char *full_path;
bool posix_open_ok = false;
struct cifs_fid fid;
struct cifs_pending_open open;
@@ -545,9 +547,10 @@ int cifs_open(struct inode *inode, struct file *file)
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- full_path = build_path_from_dentry(file_dentry(file));
- if (full_path == NULL) {
- rc = -ENOMEM;
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(file_dentry(file), page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto out;
}
@@ -639,7 +642,7 @@ int cifs_open(struct inode *inode, struct file *file)
}
out:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -688,7 +691,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
struct TCP_Server_Info *server;
struct cifsInodeInfo *cinode;
struct inode *inode;
- char *full_path = NULL;
+ void *page;
+ const char *full_path;
int desired_access;
int disposition = FILE_OPEN;
int create_options = CREATE_NOT_DIR;
@@ -698,9 +702,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
mutex_lock(&cfile->fh_mutex);
if (!cfile->invalidHandle) {
mutex_unlock(&cfile->fh_mutex);
- rc = 0;
free_xid(xid);
- return rc;
+ return 0;
}
inode = d_inode(cfile->dentry);
@@ -714,12 +717,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
* called and if the server was down that means we end up here, and we
* can never tell if the caller already has the rename_sem.
*/
- full_path = build_path_from_dentry(cfile->dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(cfile->dentry, page);
+ if (IS_ERR(full_path)) {
mutex_unlock(&cfile->fh_mutex);
+ free_dentry_path(page);
free_xid(xid);
- return rc;
+ return PTR_ERR(full_path);
}
cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n",
@@ -837,7 +841,7 @@ reopen_success:
cifs_relock_file(cfile);
reopen_error_exit:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
return rc;
}
@@ -2068,34 +2072,31 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
int flags,
struct cifsFileInfo **ret_file)
{
- struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct cifsInodeInfo *cinode;
- char *full_path;
+ void *page = alloc_dentry_path();
*ret_file = NULL;
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo,
- tlist);
- full_path = build_path_from_dentry(cfile->dentry);
- if (full_path == NULL) {
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
+ struct cifsInodeInfo *cinode;
+ const char *full_path = build_path_from_dentry(cfile->dentry, page);
+ if (IS_ERR(full_path)) {
spin_unlock(&tcon->open_file_lock);
- return -ENOMEM;
+ free_dentry_path(page);
+ return PTR_ERR(full_path);
}
- if (strcmp(full_path, name)) {
- kfree(full_path);
+ if (strcmp(full_path, name))
continue;
- }
- kfree(full_path);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_unlock(&tcon->open_file_lock);
+ free_dentry_path(page);
return cifs_get_writable_file(cinode, flags, ret_file);
}
spin_unlock(&tcon->open_file_lock);
+ free_dentry_path(page);
return -ENOENT;
}
@@ -2103,35 +2104,32 @@ int
cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file)
{
- struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct cifsInodeInfo *cinode;
- char *full_path;
+ void *page = alloc_dentry_path();
*ret_file = NULL;
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo,
- tlist);
- full_path = build_path_from_dentry(cfile->dentry);
- if (full_path == NULL) {
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
+ struct cifsInodeInfo *cinode;
+ const char *full_path = build_path_from_dentry(cfile->dentry, page);
+ if (IS_ERR(full_path)) {
spin_unlock(&tcon->open_file_lock);
- return -ENOMEM;
+ free_dentry_path(page);
+ return PTR_ERR(full_path);
}
- if (strcmp(full_path, name)) {
- kfree(full_path);
+ if (strcmp(full_path, name))
continue;
- }
- kfree(full_path);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_unlock(&tcon->open_file_lock);
+ free_dentry_path(page);
*ret_file = find_readable_file(cinode, 0);
return *ret_file ? 0 : -ENOENT;
}
spin_unlock(&tcon->open_file_lock);
+ free_dentry_path(page);
return -ENOENT;
}
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 78889024a7ed..3e0d016849e3 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -137,6 +137,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("min_enc_offload", Opt_min_enc_offload),
fsparam_u32("esize", Opt_min_enc_offload),
fsparam_u32("bsize", Opt_blocksize),
+ fsparam_u32("rasize", Opt_rasize),
fsparam_u32("rsize", Opt_rsize),
fsparam_u32("wsize", Opt_wsize),
fsparam_u32("actimeo", Opt_actimeo),
@@ -188,8 +189,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
{}
};
-int
-cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx)
+static int
+cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx)
{
substring_t args[MAX_OPT_ARGS];
@@ -203,7 +204,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx)
switch (match_token(value, cifs_secflavor_tokens, args)) {
case Opt_sec_krb5p:
- cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+ cifs_errorf(fc, "sec=krb5p is not supported!\n");
return 1;
case Opt_sec_krb5i:
ctx->sign = true;
@@ -238,7 +239,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx)
ctx->nullauth = 1;
break;
default:
- cifs_dbg(VFS, "bad security option: %s\n", value);
+ cifs_errorf(fc, "bad security option: %s\n", value);
return 1;
}
@@ -254,8 +255,8 @@ static const match_table_t cifs_cacheflavor_tokens = {
{ Opt_cache_err, NULL }
};
-int
-cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx)
+static int
+cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx)
{
substring_t args[MAX_OPT_ARGS];
@@ -291,7 +292,7 @@ cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx)
ctx->cache_rw = true;
break;
default:
- cifs_dbg(VFS, "bad cache= option: %s\n", value);
+ cifs_errorf(fc, "bad cache= option: %s\n", value);
return 1;
}
return 0;
@@ -339,7 +340,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
}
static int
-cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
+cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3)
{
substring_t args[MAX_OPT_ARGS];
@@ -347,24 +348,24 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
case Smb_1:
if (disable_legacy_dialects) {
- cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ cifs_errorf(fc, "mount with legacy dialect disabled\n");
return 1;
}
if (is_smb3) {
- cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
+ cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n");
return 1;
}
- cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n");
+ cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n");
ctx->ops = &smb1_operations;
ctx->vals = &smb1_values;
break;
case Smb_20:
if (disable_legacy_dialects) {
- cifs_dbg(VFS, "mount with legacy dialect disabled\n");
+ cifs_errorf(fc, "mount with legacy dialect disabled\n");
return 1;
}
if (is_smb3) {
- cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n");
+ cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n");
return 1;
}
ctx->ops = &smb20_operations;
@@ -372,10 +373,10 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
break;
#else
case Smb_1:
- cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n");
+ cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n");
return 1;
case Smb_20:
- cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n");
+ cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n");
return 1;
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
case Smb_21:
@@ -403,7 +404,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
ctx->vals = &smbdefault_values;
break;
default:
- cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
+ cifs_errorf(fc, "Unknown vers= option specified: %s\n", value);
return 1;
}
return 0;
@@ -430,7 +431,7 @@ int smb3_parse_opt(const char *options, const char *key, char **val)
if (nval == p)
continue;
*nval++ = 0;
- *val = kstrndup(nval, strlen(nval), GFP_KERNEL);
+ *val = kstrdup(nval, GFP_KERNEL);
rc = !*val ? -ENOMEM : 0;
goto out;
}
@@ -588,14 +589,14 @@ static int smb3_fs_context_validate(struct fs_context *fc)
struct smb3_fs_context *ctx = smb3_fc2context(fc);
if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) {
- cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n");
+ cifs_errorf(fc, "SMB Direct requires Version >=3.0\n");
return -EOPNOTSUPP;
}
#ifndef CONFIG_KEYS
/* Muliuser mounts require CONFIG_KEYS support */
if (ctx->multiuser) {
- cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
+ cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
return -1;
}
#endif
@@ -605,13 +606,13 @@ static int smb3_fs_context_validate(struct fs_context *fc)
if (!ctx->UNC) {
- cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
+ cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n");
return -1;
}
/* make sure UNC has a share name */
if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) {
- cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
+ cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n");
return -ENOENT;
}
@@ -684,49 +685,50 @@ static void smb3_fs_context_free(struct fs_context *fc)
* Compare the old and new proposed context during reconfigure
* and check if the changes are compatible.
*/
-static int smb3_verify_reconfigure_ctx(struct smb3_fs_context *new_ctx,
+static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
+ struct smb3_fs_context *new_ctx,
struct smb3_fs_context *old_ctx)
{
if (new_ctx->posix_paths != old_ctx->posix_paths) {
- cifs_dbg(VFS, "can not change posixpaths during remount\n");
+ cifs_errorf(fc, "can not change posixpaths during remount\n");
return -EINVAL;
}
if (new_ctx->sectype != old_ctx->sectype) {
- cifs_dbg(VFS, "can not change sec during remount\n");
+ cifs_errorf(fc, "can not change sec during remount\n");
return -EINVAL;
}
if (new_ctx->multiuser != old_ctx->multiuser) {
- cifs_dbg(VFS, "can not change multiuser during remount\n");
+ cifs_errorf(fc, "can not change multiuser during remount\n");
return -EINVAL;
}
if (new_ctx->UNC &&
(!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) {
- cifs_dbg(VFS, "can not change UNC during remount\n");
+ cifs_errorf(fc, "can not change UNC during remount\n");
return -EINVAL;
}
if (new_ctx->username &&
(!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) {
- cifs_dbg(VFS, "can not change username during remount\n");
+ cifs_errorf(fc, "can not change username during remount\n");
return -EINVAL;
}
if (new_ctx->password &&
(!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) {
- cifs_dbg(VFS, "can not change password during remount\n");
+ cifs_errorf(fc, "can not change password during remount\n");
return -EINVAL;
}
if (new_ctx->domainname &&
(!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) {
- cifs_dbg(VFS, "can not change domainname during remount\n");
+ cifs_errorf(fc, "can not change domainname during remount\n");
return -EINVAL;
}
if (new_ctx->nodename &&
(!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) {
- cifs_dbg(VFS, "can not change nodename during remount\n");
+ cifs_errorf(fc, "can not change nodename during remount\n");
return -EINVAL;
}
if (new_ctx->iocharset &&
(!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) {
- cifs_dbg(VFS, "can not change iocharset during remount\n");
+ cifs_errorf(fc, "can not change iocharset during remount\n");
return -EINVAL;
}
@@ -747,7 +749,7 @@ static int smb3_reconfigure(struct fs_context *fc)
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
int rc;
- rc = smb3_verify_reconfigure_ctx(ctx, cifs_sb->ctx);
+ rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx);
if (rc)
return rc;
@@ -933,13 +935,33 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
*/
if ((result.uint_32 < CIFS_MAX_MSGSIZE) ||
(result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) {
- cifs_dbg(VFS, "%s: Invalid blocksize\n",
+ cifs_errorf(fc, "%s: Invalid blocksize\n",
__func__);
goto cifs_parse_mount_err;
}
ctx->bsize = result.uint_32;
ctx->got_bsize = true;
break;
+ case Opt_rasize:
+ /*
+ * readahead size realistically should never need to be
+ * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M
+ * (perhaps an exception should be considered in the
+ * for the case of a large number of channels
+ * when multichannel is negotiated) since that would lead
+ * to plenty of parallel I/O in flight to the server.
+ * Note that smaller read ahead sizes would
+ * hurt performance of common tools like cp and scp
+ * which often trigger sequential i/o with read ahead
+ */
+ if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) ||
+ (result.uint_32 < CIFS_DEFAULT_IOSIZE)) {
+ cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n",
+ __func__, result.uint_32, SMB3_DEFAULT_IOSIZE);
+ goto cifs_parse_mount_err;
+ }
+ ctx->rasize = result.uint_32;
+ break;
case Opt_rsize:
ctx->rsize = result.uint_32;
ctx->got_rsize = true;
@@ -951,25 +973,25 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_acregmax:
ctx->acregmax = HZ * result.uint_32;
if (ctx->acregmax > CIFS_MAX_ACTIMEO) {
- cifs_dbg(VFS, "acregmax too large\n");
+ cifs_errorf(fc, "acregmax too large\n");
goto cifs_parse_mount_err;
}
break;
case Opt_acdirmax:
ctx->acdirmax = HZ * result.uint_32;
if (ctx->acdirmax > CIFS_MAX_ACTIMEO) {
- cifs_dbg(VFS, "acdirmax too large\n");
+ cifs_errorf(fc, "acdirmax too large\n");
goto cifs_parse_mount_err;
}
break;
case Opt_actimeo:
if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) {
- cifs_dbg(VFS, "timeout too large\n");
+ cifs_errorf(fc, "timeout too large\n");
goto cifs_parse_mount_err;
}
if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) ||
(ctx->acregmax != CIFS_DEF_ACTIMEO)) {
- cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n");
+ cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n");
break;
}
ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
@@ -982,7 +1004,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_max_credits:
if (result.uint_32 < 20 || result.uint_32 > 60000) {
- cifs_dbg(VFS, "%s: Invalid max_credits value\n",
+ cifs_errorf(fc, "%s: Invalid max_credits value\n",
__func__);
goto cifs_parse_mount_err;
}
@@ -990,7 +1012,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_max_channels:
if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) {
- cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n",
+ cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n",
__func__, CIFS_MAX_CHANNELS);
goto cifs_parse_mount_err;
}
@@ -999,7 +1021,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_handletimeout:
ctx->handle_timeout = result.uint_32;
if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) {
- cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n");
+ cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n");
goto cifs_parse_mount_err;
}
break;
@@ -1010,23 +1032,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case 0:
break;
case -ENOMEM:
- cifs_dbg(VFS, "Unable to allocate memory for devname\n");
+ cifs_errorf(fc, "Unable to allocate memory for devname\n");
goto cifs_parse_mount_err;
case -EINVAL:
- cifs_dbg(VFS, "Malformed UNC in devname\n");
+ cifs_errorf(fc, "Malformed UNC in devname\n");
goto cifs_parse_mount_err;
default:
- cifs_dbg(VFS, "Unknown error parsing devname\n");
+ cifs_errorf(fc, "Unknown error parsing devname\n");
goto cifs_parse_mount_err;
}
ctx->source = kstrdup(param->string, GFP_KERNEL);
if (ctx->source == NULL) {
- cifs_dbg(VFS, "OOM when copying UNC string\n");
+ cifs_errorf(fc, "OOM when copying UNC string\n");
goto cifs_parse_mount_err;
}
fc->source = kstrdup(param->string, GFP_KERNEL);
if (fc->source == NULL) {
- cifs_dbg(VFS, "OOM when copying UNC string\n");
+ cifs_errorf(fc, "OOM when copying UNC string\n");
goto cifs_parse_mount_err;
}
break;
@@ -1046,7 +1068,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
ctx->username = kstrdup(param->string, GFP_KERNEL);
if (ctx->username == NULL) {
- cifs_dbg(VFS, "OOM when copying username string\n");
+ cifs_errorf(fc, "OOM when copying username string\n");
goto cifs_parse_mount_err;
}
break;
@@ -1058,7 +1080,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->password = kstrdup(param->string, GFP_KERNEL);
if (ctx->password == NULL) {
- cifs_dbg(VFS, "OOM when copying password string\n");
+ cifs_errorf(fc, "OOM when copying password string\n");
goto cifs_parse_mount_err;
}
break;
@@ -1085,7 +1107,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
kfree(ctx->domainname);
ctx->domainname = kstrdup(param->string, GFP_KERNEL);
if (ctx->domainname == NULL) {
- cifs_dbg(VFS, "OOM when copying domainname string\n");
+ cifs_errorf(fc, "OOM when copying domainname string\n");
goto cifs_parse_mount_err;
}
cifs_dbg(FYI, "Domain name set\n");
@@ -1109,7 +1131,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
kfree(ctx->iocharset);
ctx->iocharset = kstrdup(param->string, GFP_KERNEL);
if (ctx->iocharset == NULL) {
- cifs_dbg(VFS, "OOM when copying iocharset string\n");
+ cifs_errorf(fc, "OOM when copying iocharset string\n");
goto cifs_parse_mount_err;
}
}
@@ -1175,21 +1197,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
goto cifs_parse_mount_err;
case Opt_vers:
/* protocol version (dialect) */
- if (cifs_parse_smb_version(param->string, ctx, is_smb3) != 0)
+ if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0)
goto cifs_parse_mount_err;
ctx->got_version = true;
break;
case Opt_sec:
- if (cifs_parse_security_flavors(param->string, ctx) != 0)
+ if (cifs_parse_security_flavors(fc, param->string, ctx) != 0)
goto cifs_parse_mount_err;
break;
case Opt_cache:
- if (cifs_parse_cache_flavor(param->string, ctx) != 0)
+ if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0)
goto cifs_parse_mount_err;
break;
case Opt_witness:
#ifndef CONFIG_CIFS_SWN_UPCALL
- cifs_dbg(VFS, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n");
+ cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n");
goto cifs_parse_mount_err;
#endif
ctx->witness = true;
@@ -1290,7 +1312,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_fsc:
#ifndef CONFIG_CIFS_FSCACHE
- cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
+ cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
goto cifs_parse_mount_err;
#endif
ctx->fsc = true;
@@ -1311,15 +1333,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (result.negated) {
ctx->nopersistent = true;
if (ctx->persistent) {
- cifs_dbg(VFS,
- "persistenthandles mount options conflict\n");
+ cifs_errorf(fc, "persistenthandles mount options conflict\n");
goto cifs_parse_mount_err;
}
} else {
ctx->persistent = true;
if ((ctx->nopersistent) || (ctx->resilient)) {
- cifs_dbg(VFS,
- "persistenthandles mount options conflict\n");
+ cifs_errorf(fc, "persistenthandles mount options conflict\n");
goto cifs_parse_mount_err;
}
}
@@ -1330,8 +1350,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
} else {
ctx->resilient = true;
if (ctx->persistent) {
- cifs_dbg(VFS,
- "persistenthandles mount options conflict\n");
+ cifs_errorf(fc, "persistenthandles mount options conflict\n");
goto cifs_parse_mount_err;
}
}
@@ -1379,7 +1398,9 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->cred_uid = current_uid();
ctx->linux_uid = current_uid();
ctx->linux_gid = current_gid();
- ctx->bsize = 1024 * 1024; /* can improve cp performance significantly */
+ /* By default 4MB read ahead size, 1MB block size */
+ ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance significantly */
+ ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */
/*
* default to SFM style remapping of seven reserved characters
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index 87dd1f7168f2..2a71c8e411ac 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -13,7 +13,12 @@
#include <linux/parser.h>
#include <linux/fs_parser.h>
-#define cifs_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
+/* Log errors in fs_context (new mount api) but also in dmesg (old style) */
+#define cifs_errorf(fc, fmt, ...) \
+ do { \
+ errorf(fc, fmt, ## __VA_ARGS__); \
+ cifs_dbg(VFS, fmt, ## __VA_ARGS__); \
+ } while (0)
enum smb_version {
Smb_1 = 1,
@@ -115,6 +120,7 @@ enum cifs_param {
Opt_dirmode,
Opt_min_enc_offload,
Opt_blocksize,
+ Opt_rasize,
Opt_rsize,
Opt_wsize,
Opt_actimeo,
@@ -230,6 +236,7 @@ struct smb3_fs_context {
/* reuse existing guid for multichannel */
u8 client_guid[SMB2_CLIENT_GUID_SIZE];
unsigned int bsize;
+ unsigned int rasize;
unsigned int rsize;
unsigned int wsize;
unsigned int min_offload;
@@ -257,10 +264,6 @@ struct smb3_fs_context {
extern const struct fs_parameter_spec smb3_fs_parameters[];
-extern int cifs_parse_cache_flavor(char *value,
- struct smb3_fs_context *ctx);
-extern int cifs_parse_security_flavors(char *value,
- struct smb3_fs_context *ctx);
extern int smb3_init_fs_context(struct fs_context *fc);
extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx);
extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f2df4422e54a..002d864b8f7b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -157,12 +157,18 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
}
/* populate an inode with info from a cifs_fattr struct */
-void
+int
cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ if (!(inode->i_state & I_NEW) &&
+ unlikely(inode_wrong_type(inode, fattr->cf_mode))) {
+ CIFS_I(inode)->time = 0; /* force reval */
+ return -ESTALE;
+ }
+
cifs_revalidate_cache(inode, fattr);
spin_lock(&inode->i_lock);
@@ -219,6 +225,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_flags |= S_AUTOMOUNT;
if (inode->i_state & I_NEW)
cifs_set_ops(inode);
+ return 0;
}
void
@@ -363,7 +370,7 @@ cifs_get_file_info_unix(struct file *filp)
rc = 0;
}
- cifs_fattr_to_inode(inode, &fattr);
+ rc = cifs_fattr_to_inode(inode, &fattr);
free_xid(xid);
return rc;
}
@@ -426,14 +433,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
}
/* if filetype is different, return error */
- if (unlikely(((*pinode)->i_mode & S_IFMT) !=
- (fattr.cf_mode & S_IFMT))) {
- CIFS_I(*pinode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto cgiiu_exit;
- }
-
- cifs_fattr_to_inode(*pinode, &fattr);
+ rc = cifs_fattr_to_inode(*pinode, &fattr);
}
cgiiu_exit:
@@ -783,7 +783,8 @@ cifs_get_file_info(struct file *filp)
*/
fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
- cifs_fattr_to_inode(inode, &fattr);
+ /* if filetype is different, return error */
+ rc = cifs_fattr_to_inode(inode, &fattr);
cgfi_exit:
free_xid(xid);
return rc;
@@ -1100,16 +1101,8 @@ handle_mnt_opt:
rc = -ESTALE;
goto out;
}
-
/* if filetype is different, return error */
- if (unlikely(((*inode)->i_mode & S_IFMT) !=
- (fattr.cf_mode & S_IFMT))) {
- CIFS_I(*inode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto out;
- }
-
- cifs_fattr_to_inode(*inode, &fattr);
+ rc = cifs_fattr_to_inode(*inode, &fattr);
}
out:
cifs_buf_release(smb1_backup_rsp_buf);
@@ -1215,14 +1208,7 @@ smb311_posix_get_inode_info(struct inode **inode,
}
/* if filetype is different, return error */
- if (unlikely(((*inode)->i_mode & S_IFMT) !=
- (fattr.cf_mode & S_IFMT))) {
- CIFS_I(*inode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto out;
- }
-
- cifs_fattr_to_inode(*inode, &fattr);
+ rc = cifs_fattr_to_inode(*inode, &fattr);
}
out:
cifs_put_tlink(tlink);
@@ -1249,7 +1235,7 @@ cifs_find_inode(struct inode *inode, void *opaque)
return 0;
/* don't match inode of different type */
- if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
+ if (inode_wrong_type(inode, fattr->cf_mode))
return 0;
/* if it's not a directory or has no dentries, then flag it */
@@ -1317,6 +1303,7 @@ retry_iget5_locked:
}
}
+ /* can't fail - see cifs_find_inode() */
cifs_fattr_to_inode(inode, fattr);
if (sb->s_flags & SB_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
@@ -1408,7 +1395,7 @@ out:
int
cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
- char *full_path, __u32 dosattr)
+ const char *full_path, __u32 dosattr)
{
bool set_time = false;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1609,7 +1596,8 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
{
int rc = 0;
unsigned int xid;
- char *full_path = NULL;
+ const char *full_path;
+ void *page;
struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cifs_inode;
struct super_block *sb = dir->i_sb;
@@ -1629,6 +1617,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
server = tcon->ses->server;
xid = get_xid();
+ page = alloc_dentry_path();
if (tcon->nodelete) {
rc = -EACCES;
@@ -1637,9 +1626,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
/* Unlink can be called from rename so we can not take the
* sb->s_vfs_rename_mutex here */
- full_path = build_path_from_dentry(dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto unlink_out;
}
@@ -1713,7 +1702,7 @@ out_reval:
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
- kfree(full_path);
+ free_dentry_path(page);
kfree(attrs);
free_xid(xid);
cifs_put_tlink(tlink);
@@ -1740,6 +1729,16 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
if (rc)
return rc;
+ if (!S_ISDIR(inode->i_mode)) {
+ /*
+ * mkdir succeeded, but another client has managed to remove the
+ * sucker and replace it with non-directory. Return success,
+ * but don't leave the child in dcache.
+ */
+ iput(inode);
+ d_drop(dentry);
+ return 0;
+ }
/*
* setting nlink not necessary except in cases where we failed to get it
* from the server or was set bogus. Also, since this is a brand new
@@ -1791,7 +1790,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
}
}
d_instantiate(dentry, inode);
- return rc;
+ return 0;
}
static int
@@ -1866,7 +1865,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- char *full_path;
+ const char *full_path;
+ void *page;
cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n",
mode, inode);
@@ -1879,9 +1879,10 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode,
xid = get_xid();
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto mkdir_out;
}
@@ -1924,7 +1925,7 @@ mkdir_out:
* attributes are invalid now.
*/
CIFS_I(inode)->time = 0;
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -1938,16 +1939,17 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
struct cifsInodeInfo *cifsInode;
cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode);
xid = get_xid();
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto rmdir_exit;
}
@@ -1997,7 +1999,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
current_time(inode);
rmdir_exit:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
return rc;
}
@@ -2072,8 +2074,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
struct dentry *source_dentry, struct inode *target_dir,
struct dentry *target_dentry, unsigned int flags)
{
- char *from_name = NULL;
- char *to_name = NULL;
+ const char *from_name, *to_name;
+ void *page1, *page2;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
@@ -2091,21 +2093,19 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ page1 = alloc_dentry_path();
+ page2 = alloc_dentry_path();
xid = get_xid();
- /*
- * we already have the rename sem so we do not need to
- * grab it again here to protect the path integrity
- */
- from_name = build_path_from_dentry(source_dentry);
- if (from_name == NULL) {
- rc = -ENOMEM;
+ from_name = build_path_from_dentry(source_dentry, page1);
+ if (IS_ERR(from_name)) {
+ rc = PTR_ERR(from_name);
goto cifs_rename_exit;
}
- to_name = build_path_from_dentry(target_dentry);
- if (to_name == NULL) {
- rc = -ENOMEM;
+ to_name = build_path_from_dentry(target_dentry, page2);
+ if (IS_ERR(to_name)) {
+ rc = PTR_ERR(to_name);
goto cifs_rename_exit;
}
@@ -2177,18 +2177,21 @@ unlink_target:
cifs_rename_exit:
kfree(info_buf_source);
- kfree(from_name);
- kfree(to_name);
+ free_dentry_path(page2);
+ free_dentry_path(page1);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
}
static bool
-cifs_inode_needs_reval(struct inode *inode)
+cifs_dentry_needs_reval(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct cached_fid *cfid = NULL;
if (cifs_i->time == 0)
return true;
@@ -2199,6 +2202,16 @@ cifs_inode_needs_reval(struct inode *inode)
if (!lookupCacheEnabled)
return true;
+ if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) {
+ mutex_lock(&cfid->fid_mutex);
+ if (cfid->time && cifs_i->time > cfid->time) {
+ mutex_unlock(&cfid->fid_mutex);
+ close_cached_dir(cfid);
+ return false;
+ }
+ mutex_unlock(&cfid->fid_mutex);
+ close_cached_dir(cfid);
+ }
/*
* depending on inode type, check if attribute caching disabled for
* files or directories
@@ -2297,10 +2310,10 @@ cifs_zap_mapping(struct inode *inode)
int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
- struct inode *inode = file_inode(filp);
+ struct dentry *dentry = file_dentry(filp);
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
- if (!cifs_inode_needs_reval(inode))
+ if (!cifs_dentry_needs_reval(dentry))
return rc;
if (tlink_tcon(cfile->tlink)->unix_ext)
@@ -2317,22 +2330,22 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
int rc = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
- char *full_path = NULL;
+ const char *full_path;
+ void *page;
int count = 0;
if (inode == NULL)
return -ENOENT;
- if (!cifs_inode_needs_reval(inode))
+ if (!cifs_dentry_needs_reval(dentry))
return rc;
xid = get_xid();
- /* can not safely grab the rename sem here if rename calls revalidate
- since that would deadlock */
- full_path = build_path_from_dentry(dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto out;
}
@@ -2351,7 +2364,7 @@ again:
if (rc == -EAGAIN && count++ < 10)
goto again;
out:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
return rc;
@@ -2522,7 +2535,7 @@ void cifs_setsize(struct inode *inode, loff_t offset)
static int
cifs_set_file_size(struct inode *inode, struct iattr *attrs,
- unsigned int xid, char *full_path)
+ unsigned int xid, const char *full_path)
{
int rc;
struct cifsFileInfo *open_file;
@@ -2613,7 +2626,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
{
int rc;
unsigned int xid;
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
struct inode *inode = d_inode(direntry);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -2634,9 +2648,9 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if (rc < 0)
goto out;
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto out;
}
@@ -2748,7 +2762,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
cifsInode->time = 0;
out:
kfree(args);
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
return rc;
}
@@ -2764,7 +2778,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifsFileInfo *wfile;
struct cifs_tcon *tcon;
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
int rc = -EACCES;
__u32 dosattr = 0;
__u64 mode = NO_CHANGE_64;
@@ -2778,16 +2793,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid |= ATTR_FORCE;
rc = setattr_prepare(&init_user_ns, direntry, attrs);
- if (rc < 0) {
- free_xid(xid);
- return rc;
- }
+ if (rc < 0)
+ goto cifs_setattr_exit;
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
- free_xid(xid);
- return rc;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ goto cifs_setattr_exit;
}
/*
@@ -2937,8 +2949,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
mark_inode_dirty(inode);
cifs_setattr_exit:
- kfree(full_path);
free_xid(xid);
+ free_dentry_path(page);
return rc;
}
@@ -2961,12 +2973,3 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry,
/* BB: add cifs_setattr_legacy for really old servers */
return rc;
}
-
-#if 0
-void cifs_delete_inode(struct inode *inode)
-{
- cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode);
- /* may have to add back in if and when safe distributed caching of
- directories added e.g. via FindNotify */
-}
-#endif
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index dcde44ff6cf9..08d99fec593e 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -42,13 +42,16 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct dentry *dentry = filep->f_path.dentry;
- unsigned char *path;
+ const unsigned char *path;
+ void *page = alloc_dentry_path();
__le16 *utf16_path = NULL, root_path;
int rc = 0;
- path = build_path_from_dentry(dentry);
- if (path == NULL)
- return -ENOMEM;
+ path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(path)) {
+ free_dentry_path(page);
+ return PTR_ERR(path);
+ }
cifs_dbg(FYI, "%s %s\n", __func__, path);
@@ -73,7 +76,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
ici_exit:
if (utf16_path != &root_path)
kfree(utf16_path);
- kfree(path);
+ free_dentry_path(page);
return rc;
}
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 7c5878a645d9..616e1bc0cc0a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
{
int rc = -EACCES;
unsigned int xid;
- char *from_name = NULL;
- char *to_name = NULL;
+ const char *from_name, *to_name;
+ void *page1, *page2;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
struct cifs_tcon *tcon;
@@ -524,11 +524,17 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
tcon = tlink_tcon(tlink);
xid = get_xid();
+ page1 = alloc_dentry_path();
+ page2 = alloc_dentry_path();
- from_name = build_path_from_dentry(old_file);
- to_name = build_path_from_dentry(direntry);
- if ((from_name == NULL) || (to_name == NULL)) {
- rc = -ENOMEM;
+ from_name = build_path_from_dentry(old_file, page1);
+ if (IS_ERR(from_name)) {
+ rc = PTR_ERR(from_name);
+ goto cifs_hl_exit;
+ }
+ to_name = build_path_from_dentry(direntry, page2);
+ if (IS_ERR(to_name)) {
+ rc = PTR_ERR(to_name);
goto cifs_hl_exit;
}
@@ -587,8 +593,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
}
cifs_hl_exit:
- kfree(from_name);
- kfree(to_name);
+ free_dentry_path(page1);
+ free_dentry_path(page2);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -600,7 +606,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode,
{
int rc = -ENOMEM;
unsigned int xid;
- char *full_path = NULL;
+ const char *full_path;
+ void *page;
char *target_path = NULL;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = NULL;
@@ -620,11 +627,13 @@ cifs_get_link(struct dentry *direntry, struct inode *inode,
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- full_path = build_path_from_dentry(direntry);
- if (!full_path) {
+ page = alloc_dentry_path();
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
free_xid(xid);
cifs_put_tlink(tlink);
- return ERR_PTR(-ENOMEM);
+ free_dentry_path(page);
+ return ERR_CAST(full_path);
}
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
@@ -649,7 +658,7 @@ cifs_get_link(struct dentry *direntry, struct inode *inode,
&target_path, reparse_point);
}
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
if (rc != 0) {
@@ -669,7 +678,8 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
struct inode *newinode = NULL;
xid = get_xid();
@@ -681,9 +691,9 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
}
pTcon = tlink_tcon(tlink);
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto symlink_exit;
}
@@ -719,7 +729,7 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode,
}
}
symlink_exit:
- kfree(full_path);
+ free_dentry_path(page);
cifs_put_tlink(tlink);
free_xid(xid);
return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 82e176720ca6..c15a90e422be 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1180,7 +1180,7 @@ int update_super_prepath(struct cifs_tcon *tcon, char *prefix)
kfree(cifs_sb->prepath);
if (prefix && *prefix) {
- cifs_sb->prepath = kstrndup(prefix, strlen(prefix), GFP_ATOMIC);
+ cifs_sb->prepath = kstrdup(prefix, GFP_ATOMIC);
if (!cifs_sb->prepath) {
rc = -ENOMEM;
goto out;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 80bf4c6f4c7b..63bfc533c9fb 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -119,9 +119,7 @@ retry:
/* update inode in place
* if both i_ino and i_mode didn't change */
if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
- (inode->i_mode & S_IFMT) ==
- (fattr->cf_mode & S_IFMT)) {
- cifs_fattr_to_inode(inode, fattr);
+ cifs_fattr_to_inode(inode, fattr) == 0) {
dput(dentry);
return;
}
@@ -384,7 +382,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
static int
initiate_cifs_search(const unsigned int xid, struct file *file,
- char *full_path)
+ const char *full_path)
{
__u16 search_flags;
int rc = 0;
@@ -704,7 +702,7 @@ static int cifs_save_resume_key(const char *current_entry,
*/
static int
find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
- struct file *file, char *full_path,
+ struct file *file, const char *full_path,
char **current_entry, int *num_to_ret)
{
__u16 search_flags;
@@ -942,13 +940,14 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
char *tmp_buf = NULL;
char *end_of_smb;
unsigned int max_len;
- char *full_path = NULL;
+ const char *full_path;
+ void *page = alloc_dentry_path();
xid = get_xid();
- full_path = build_path_from_dentry(file_dentry(file));
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(file_dentry(file), page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto rddir2_exit;
}
@@ -1043,7 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
kfree(tmp_buf);
rddir2_exit:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
return rc;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index e31b939e628c..3b83839fc2c2 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -926,9 +926,7 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
0);
if (!rc) {
- *symlinkinfo = kstrndup(referral.node_name,
- strlen(referral.node_name),
- GFP_KERNEL);
+ *symlinkinfo = kstrdup(referral.node_name, GFP_KERNEL);
free_dfs_info_param(&referral);
if (!*symlinkinfo)
rc = -ENOMEM;
@@ -1027,7 +1025,7 @@ cifs_can_echo(struct TCP_Server_Info *server)
static int
cifs_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 99a1951a01ec..d9a990c99121 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -58,6 +58,7 @@
#define SMB2_HMACSHA256_SIZE (32)
#define SMB2_CMACAES_SIZE (16)
#define SMB3_SIGNKEY_SIZE (16)
+#define SMB3_GCM128_CRYPTKEY_SIZE (16)
#define SMB3_GCM256_CRYPTKEY_SIZE (32)
/* Maximum buffer size value we can send with 1 credit */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index a718dc77e604..9a61209a283e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -512,7 +512,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_file_all_info *smb2_data;
__u32 create_options = 0;
- bool no_cached_open = tcon->nohandlecache;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
@@ -525,11 +524,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return -ENOMEM;
/* If it is a root and its handle is cached then use it */
- if (!strlen(full_path) && !no_cached_open) {
- rc = open_shroot(xid, tcon, cifs_sb, &cfid);
- if (rc)
- goto out;
-
+ rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid);
+ if (!rc) {
if (tcon->crfid.file_all_info_is_valid) {
move_smb2_info_to_cifs(data,
&tcon->crfid.file_all_info);
@@ -540,7 +536,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!rc)
move_smb2_info_to_cifs(data, smb2_data);
}
- close_shroot(cfid);
+ close_cached_dir(cfid);
goto out;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b50164e2c88d..06d555d4da9a 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -667,6 +667,7 @@ smb2_is_valid_lease_break(char *buffer)
!memcmp(rsp->LeaseKey,
tcon->crfid.fid->lease_key,
SMB2_LEASE_KEY_SIZE)) {
+ tcon->crfid.time = 0;
INIT_WORK(&tcon->crfid.lease_break,
smb2_cached_lease_break);
queue_work(cifsiod_wq,
@@ -754,8 +755,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
}
}
spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
- return false;
+ cifs_dbg(FYI, "No file id matched, oplock break ignored\n");
+ return true;
}
void
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 9bae7e8deb09..dd0eb665b680 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -690,17 +690,21 @@ smb2_close_cached_fid(struct kref *ref)
cfid->is_valid = false;
cfid->file_all_info_is_valid = false;
cfid->has_lease = false;
+ if (cfid->dentry) {
+ dput(cfid->dentry);
+ cfid->dentry = NULL;
+ }
}
}
-void close_shroot(struct cached_fid *cfid)
+void close_cached_dir(struct cached_fid *cfid)
{
mutex_lock(&cfid->fid_mutex);
kref_put(&cfid->refcount, smb2_close_cached_fid);
mutex_unlock(&cfid->fid_mutex);
}
-void close_shroot_lease_locked(struct cached_fid *cfid)
+void close_cached_dir_lease_locked(struct cached_fid *cfid)
{
if (cfid->has_lease) {
cfid->has_lease = false;
@@ -708,10 +712,10 @@ void close_shroot_lease_locked(struct cached_fid *cfid)
}
}
-void close_shroot_lease(struct cached_fid *cfid)
+void close_cached_dir_lease(struct cached_fid *cfid)
{
mutex_lock(&cfid->fid_mutex);
- close_shroot_lease_locked(cfid);
+ close_cached_dir_lease_locked(cfid);
mutex_unlock(&cfid->fid_mutex);
}
@@ -721,13 +725,15 @@ smb2_cached_lease_break(struct work_struct *work)
struct cached_fid *cfid = container_of(work,
struct cached_fid, lease_break);
- close_shroot_lease(cfid);
+ close_cached_dir_lease(cfid);
}
/*
- * Open the directory at the root of a share
+ * Open the and cache a directory handle.
+ * Only supported for the root handle.
*/
-int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
+int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
struct cifs_sb_info *cifs_sb,
struct cached_fid **cfid)
{
@@ -745,6 +751,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
__le16 utf16_path = 0; /* Null - since an open of top of share */
u8 oplock = SMB2_OPLOCK_LEVEL_II;
struct cifs_fid *pfid;
+ struct dentry *dentry;
+
+ if (tcon->nohandlecache)
+ return -ENOTSUPP;
+
+ if (cifs_sb->root == NULL)
+ return -ENOENT;
+
+ if (strlen(path))
+ return -ENOENT;
+
+ dentry = cifs_sb->root;
mutex_lock(&tcon->crfid.fid_mutex);
if (tcon->crfid.is_valid) {
@@ -830,11 +848,9 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
};
/*
- * caller expects this func to set pfid to a valid
- * cached root, so we copy the existing one and get a
- * reference.
+ * caller expects this func to set the fid in crfid to valid
+ * cached root, so increment the refcount.
*/
- memcpy(pfid, tcon->crfid.fid, sizeof(*pfid));
kref_get(&tcon->crfid.refcount);
mutex_unlock(&tcon->crfid.fid_mutex);
@@ -867,13 +883,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
#endif /* CIFS_DEBUG2 */
- memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid));
tcon->crfid.tcon = tcon;
tcon->crfid.is_valid = true;
+ tcon->crfid.dentry = dentry;
+ dget(dentry);
kref_init(&tcon->crfid.refcount);
/* BB TBD check to see if oplock level check can be removed below */
if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
+ /*
+ * See commit 2f94a3125b87. Increment the refcount when we
+ * get a lease for root, release it if lease break occurs
+ */
kref_get(&tcon->crfid.refcount);
tcon->crfid.has_lease = true;
smb2_parse_contexts(server, o_rsp,
@@ -892,6 +913,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
&rsp_iov[1], sizeof(struct smb2_file_all_info),
(char *)&tcon->crfid.file_all_info))
tcon->crfid.file_all_info_is_valid = true;
+ tcon->crfid.time = jiffies;
+
oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
@@ -905,6 +928,22 @@ oshr_free:
return rc;
}
+int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **cfid)
+{
+ mutex_lock(&tcon->crfid.fid_mutex);
+ if (tcon->crfid.dentry == dentry) {
+ cifs_dbg(FYI, "found a cached root file handle by dentry\n");
+ *cfid = &tcon->crfid;
+ kref_get(&tcon->crfid.refcount);
+ mutex_unlock(&tcon->crfid.fid_mutex);
+ return 0;
+ }
+ mutex_unlock(&tcon->crfid.fid_mutex);
+ return -ENOENT;
+}
+
static void
smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb)
@@ -914,7 +953,6 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
- bool no_cached_open = tcon->nohandlecache;
struct cached_fid *cfid = NULL;
oparms.tcon = tcon;
@@ -924,14 +962,12 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- if (no_cached_open) {
+ rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid);
+ if (rc == 0)
+ memcpy(&fid, cfid->fid, sizeof(struct cifs_fid));
+ else
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
- } else {
- rc = open_shroot(xid, tcon, cifs_sb, &cfid);
- if (rc == 0)
- memcpy(&fid, cfid->fid, sizeof(struct cifs_fid));
- }
if (rc)
return;
@@ -945,10 +981,10 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
FS_VOLUME_INFORMATION);
SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
- if (no_cached_open)
+ if (cfid == NULL)
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
else
- close_shroot(cfid);
+ close_cached_dir(cfid);
}
static void
@@ -1531,7 +1567,10 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
NULL, 0 /* no input */, CIFSMaxBufSize,
(char **)&res_key, &ret_data_len);
- if (rc) {
+ if (rc == -EOPNOTSUPP) {
+ pr_warn_once("Server share %s does not support copy range\n", tcon->treeName);
+ goto req_res_key_exit;
+ } else if (rc) {
cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
goto req_res_key_exit;
}
@@ -1763,18 +1802,14 @@ smb2_ioctl_query_info(const unsigned int xid,
}
iqinf_exit:
- kfree(vars);
- kfree(buffer);
- SMB2_open_free(&rqst[0]);
- if (qi.flags & PASSTHRU_FSCTL)
- SMB2_ioctl_free(&rqst[1]);
- else
- SMB2_query_info_free(&rqst[1]);
-
- SMB2_close_free(&rqst[2]);
+ cifs_small_buf_release(rqst[0].rq_iov[0].iov_base);
+ cifs_small_buf_release(rqst[1].rq_iov[0].iov_base);
+ cifs_small_buf_release(rqst[2].rq_iov[0].iov_base);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ kfree(vars);
+ kfree(buffer);
return rc;
e_fault:
@@ -2038,6 +2073,7 @@ smb2_duplicate_extents(const unsigned int xid,
{
int rc;
unsigned int ret_data_len;
+ struct inode *inode;
struct duplicate_extents_to_file dup_ext_buf;
struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
@@ -2054,10 +2090,21 @@ smb2_duplicate_extents(const unsigned int xid,
cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n",
src_off, dest_off, len);
- rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
- if (rc)
- goto duplicate_extents_out;
+ inode = d_inode(trgtfile->dentry);
+ if (inode->i_size < dest_off + len) {
+ rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
+ if (rc)
+ goto duplicate_extents_out;
+ /*
+ * Although also could set plausible allocation size (i_blocks)
+ * here in addition to setting the file size, in reflink
+ * it is likely that the target file is sparse. Its allocation
+ * size will be queried on next revalidate, but it is important
+ * to make sure that file's cached size is updated immediately
+ */
+ cifs_setsize(inode, dest_off + len);
+ }
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
@@ -2205,22 +2252,23 @@ smb3_notify(const unsigned int xid, struct file *pfile,
struct smb3_notify notify;
struct dentry *dentry = pfile->f_path.dentry;
struct inode *inode = file_inode(pfile);
- struct cifs_sb_info *cifs_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_open_parms oparms;
struct cifs_fid fid;
struct cifs_tcon *tcon;
- unsigned char *path = NULL;
+ const unsigned char *path;
+ void *page = alloc_dentry_path();
__le16 *utf16_path = NULL;
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
int rc = 0;
- path = build_path_from_dentry(dentry);
- if (path == NULL)
- return -ENOMEM;
-
- cifs_sb = CIFS_SB(inode->i_sb);
+ path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(path)) {
+ rc = PTR_ERR(path);
+ goto notify_exit;
+ }
- utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb);
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (utf16_path == NULL) {
rc = -ENOMEM;
goto notify_exit;
@@ -2252,7 +2300,7 @@ smb3_notify(const unsigned int xid, struct file *pfile,
cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc);
notify_exit:
- kfree(path);
+ free_dentry_path(page);
kfree(utf16_path);
return rc;
}
@@ -3640,6 +3688,77 @@ out:
return rc;
}
+static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
+ loff_t off, loff_t len)
+{
+ int rc;
+ unsigned int xid;
+ struct cifsFileInfo *cfile = file->private_data;
+ __le64 eof;
+
+ xid = get_xid();
+
+ if (off >= i_size_read(file->f_inode) ||
+ off + len >= i_size_read(file->f_inode)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
+ i_size_read(file->f_inode) - off - len, off);
+ if (rc < 0)
+ goto out;
+
+ eof = cpu_to_le64(i_size_read(file->f_inode) - len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid, &eof);
+ if (rc < 0)
+ goto out;
+
+ rc = 0;
+ out:
+ free_xid(xid);
+ return rc;
+}
+
+static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
+ loff_t off, loff_t len)
+{
+ int rc;
+ unsigned int xid;
+ struct cifsFileInfo *cfile = file->private_data;
+ __le64 eof;
+ __u64 count;
+
+ xid = get_xid();
+
+ if (off >= i_size_read(file->f_inode)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ count = i_size_read(file->f_inode) - off;
+ eof = cpu_to_le64(i_size_read(file->f_inode) + len);
+
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid, &eof);
+ if (rc < 0)
+ goto out;
+
+ rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
+ if (rc < 0)
+ goto out;
+
+ rc = smb3_zero_range(file, tcon, off, len, 1);
+ if (rc < 0)
+ goto out;
+
+ rc = 0;
+ out:
+ free_xid(xid);
+ return rc;
+}
+
static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offset, int whence)
{
struct cifsFileInfo *wrcfile, *cfile = file->private_data;
@@ -3811,6 +3930,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
return smb3_zero_range(file, tcon, off, len, false);
} else if (mode == FALLOC_FL_KEEP_SIZE)
return smb3_simple_falloc(file, tcon, off, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ return smb3_collapse_range(file, tcon, off, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ return smb3_insert_range(file, tcon, off, len);
else if (mode == 0)
return smb3_simple_falloc(file, tcon, off, len, false);
@@ -4158,7 +4281,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
if (ses->Suid == ses_id) {
ses_enc_key = enc ? ses->smb3encryptionkey :
ses->smb3decryptionkey;
- memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
+ memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
@@ -4166,7 +4289,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
}
spin_unlock(&cifs_tcp_ses_lock);
- return 1;
+ return -EAGAIN;
}
/*
* Encrypt or decrypt @rqst message. @rqst[0] has the following format:
@@ -4185,7 +4308,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
int rc = 0;
struct scatterlist *sg;
u8 sign[SMB2_SIGNATURE_SIZE] = {};
- u8 key[SMB3_SIGN_KEY_SIZE];
+ u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
char *iv;
unsigned int iv_len;
@@ -4209,10 +4332,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
tfm = enc ? server->secmech.ccmaesencrypt :
server->secmech.ccmaesdecrypt;
- if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
else
- rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);
+ rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
@@ -4955,7 +5079,7 @@ smb2_next_header(char *buf)
static int
smb2_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
int rc = -EPERM;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2199a9bfae8f..e36c2a867783 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1857,7 +1857,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
return 0;
- close_shroot_lease(&tcon->crfid);
+ close_cached_dir_lease(&tcon->crfid);
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
(void **) &req,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index a5a9e33c0d73..6442dc1c292b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -144,7 +144,7 @@ struct smb2_transform_hdr {
} __packed;
/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr {
+struct smb2_compression_transform_hdr_unchained {
__le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
__le32 OriginalCompressedSegmentSize;
__le16 CompressionAlgorithm;
@@ -160,10 +160,17 @@ struct compression_payload_header {
__le16 CompressionAlgorithm;
__le16 Flags;
__le32 Length; /* length of compressed playload including field below if present */
- /* __le32 OriginalPayloadSize; */ /* optional */
+ /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
} __packed;
/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+ __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
+ __le32 OriginalCompressedSegmentSize;
+ /* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
struct compression_pattern_payload_v1 {
__le16 Pattern;
__le16 Reserved1;
@@ -181,7 +188,11 @@ struct smb2_rdma_transform {
__le32 Reserved2;
} __packed;
-struct smb2_rdma_encryption_transform {
+/* TransformType */
+#define SMB2_RDMA_TRANSFORM_TYPE_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_TYPE_SIGNING 0x0002
+
+struct smb2_rdma_crypto_transform {
__le16 TransformType;
__le16 SignatureLength;
__le16 NonceLength;
@@ -409,13 +420,29 @@ struct smb2_netname_neg_context {
} __packed;
/*
- * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
* and 2.2.4.1.5
*/
+/* Flags */
+#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001
+
+struct smb2_transport_capabilities_context {
+ __le16 ContextType; /* 6 */
+ __le16 DataLength;
+ __u32 Reserved;
+ __le32 Flags;
+} __packed;
+
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.6
+ */
+
/* RDMA Transform IDs */
#define SMB2_RDMA_TRANSFORM_NONE 0x0000
#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002
struct smb2_rdma_transform_capabilities_context {
__le16 ContextType; /* 7 */
@@ -427,6 +454,11 @@ struct smb2_rdma_transform_capabilities_context {
__le16 RDMATransformIds[];
} __packed;
+/*
+ * For signing capabilities context see MS-SMB2 2.2.3.1.7
+ * and 2.2.4.1.7
+ */
+
/* Signing algorithms */
#define SIGNING_ALG_HMAC_SHA256 0
#define SIGNING_ALG_AES_CMAC 1
@@ -634,7 +666,8 @@ struct smb2_tree_connect_rsp {
#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000
#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000
#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL 0x0004FF33
+#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL 0x0014FF33
/* Possible share capabilities */
#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */
@@ -1390,7 +1423,11 @@ struct smb2_lock_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 48 */
__le16 LockCount;
- __le32 Reserved;
+ /*
+ * The least significant four bits are the index, the other 28 bits are
+ * the lock sequence number (0 to 64). See MS-SMB2 2.2.26
+ */
+ __le32 LockSequenceNumber;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
/* Followed by at least one */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index a2eb34a8d9c9..a5f87b02cfaf 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -69,12 +69,16 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- struct cached_fid **cfid);
-extern void close_shroot(struct cached_fid *cfid);
-extern void close_shroot_lease(struct cached_fid *cfid);
-extern void close_shroot_lease_locked(struct cached_fid *cfid);
+extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
+ const char *path,
+ struct cifs_sb_info *cifs_sb,
+ struct cached_fid **cfid);
+extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon,
+ struct dentry *dentry,
+ struct cached_fid **cfid);
+extern void close_cached_dir(struct cached_fid *cfid);
+extern void close_cached_dir_lease(struct cached_fid *cfid);
+extern void close_cached_dir_lease_locked(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index ebccd71cc60a..e6fa76ab70be 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -298,7 +298,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
{
unsigned char zero = 0x0;
__u8 i[4] = {0, 0, 0, 1};
- __u8 L[4] = {0, 0, 0, 128};
+ __u8 L128[4] = {0, 0, 0, 128};
+ __u8 L256[4] = {0, 0, 1, 0};
int rc = 0;
unsigned char prfhash[SMB2_HMACSHA256_SIZE];
unsigned char *hashptr = prfhash;
@@ -354,8 +355,14 @@ static int generate_key(struct cifs_ses *ses, struct kvec label,
goto smb3signkey_ret;
}
- rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
- L, 4);
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ L256, 4);
+ } else {
+ rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+ L128, 4);
+ }
if (rc) {
cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__);
goto smb3signkey_ret;
@@ -390,6 +397,9 @@ generate_smb3signingkey(struct cifs_ses *ses,
const struct derivation_triplet *ptriplet)
{
int rc;
+#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS
+ struct TCP_Server_Info *server = ses->server;
+#endif
/*
* All channels use the same encryption/decryption keys but
@@ -422,11 +432,11 @@ generate_smb3signingkey(struct cifs_ses *ses,
rc = generate_key(ses, ptriplet->encryption.label,
ptriplet->encryption.context,
ses->smb3encryptionkey,
- SMB3_SIGN_KEY_SIZE);
+ SMB3_ENC_DEC_KEY_SIZE);
rc = generate_key(ses, ptriplet->decryption.label,
ptriplet->decryption.context,
ses->smb3decryptionkey,
- SMB3_SIGN_KEY_SIZE);
+ SMB3_ENC_DEC_KEY_SIZE);
if (rc)
return rc;
}
@@ -442,14 +452,23 @@ generate_smb3signingkey(struct cifs_ses *ses,
*/
cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid),
&ses->Suid);
+ cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type);
cifs_dbg(VFS, "Session Key %*ph\n",
SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response);
cifs_dbg(VFS, "Signing Key %*ph\n",
SMB3_SIGN_KEY_SIZE, ses->smb3signingkey);
- cifs_dbg(VFS, "ServerIn Key %*ph\n",
- SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey);
- cifs_dbg(VFS, "ServerOut Key %*ph\n",
- SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey);
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) {
+ cifs_dbg(VFS, "ServerIn Key %*ph\n",
+ SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3encryptionkey);
+ cifs_dbg(VFS, "ServerOut Key %*ph\n",
+ SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3decryptionkey);
+ } else {
+ cifs_dbg(VFS, "ServerIn Key %*ph\n",
+ SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3encryptionkey);
+ cifs_dbg(VFS, "ServerOut Key %*ph\n",
+ SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3decryptionkey);
+ }
#endif
return rc;
}
diff --git a/fs/cifs/unc.c b/fs/cifs/unc.c
index 394aa00cea40..f6fc5e343ea4 100644
--- a/fs/cifs/unc.c
+++ b/fs/cifs/unc.c
@@ -50,7 +50,6 @@ char *extract_sharename(const char *unc)
{
const char *src;
char *delim, *dst;
- int len;
/* skip double chars at the beginning */
src = unc + 2;
@@ -60,10 +59,9 @@ char *extract_sharename(const char *unc)
if (!delim)
return ERR_PTR(-EINVAL);
delim++;
- len = strlen(delim);
/* caller has to free the memory */
- dst = kstrndup(delim, len, GFP_KERNEL);
+ dst = kstrdup(delim, GFP_KERNEL);
if (!dst)
return ERR_PTR(-ENOMEM);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 41a611e76bb7..e351b945135b 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -53,7 +53,7 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL };
static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
- struct inode *inode, char *full_path,
+ struct inode *inode, const char *full_path,
const void *value, size_t size)
{
ssize_t rc = -EOPNOTSUPP;
@@ -77,7 +77,7 @@ static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
}
static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon,
- struct inode *inode, char *full_path,
+ struct inode *inode, const char *full_path,
const void *value, size_t size)
{
ssize_t rc = -EOPNOTSUPP;
@@ -112,7 +112,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- char *full_path;
+ const char *full_path;
+ void *page;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -120,10 +121,11 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
pTcon = tlink_tcon(tlink);
xid = get_xid();
+ page = alloc_dentry_path();
- full_path = build_path_from_dentry(dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto out;
}
/* return dos attributes as pseudo xattr */
@@ -235,7 +237,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
}
out:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -297,7 +299,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- char *full_path;
+ const char *full_path;
+ void *page;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -305,10 +308,11 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
pTcon = tlink_tcon(tlink);
xid = get_xid();
+ page = alloc_dentry_path();
- full_path = build_path_from_dentry(dentry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto out;
}
@@ -401,7 +405,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
rc = -EOPNOTSUPP;
out:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
@@ -414,7 +418,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
- char *full_path;
+ const char *full_path;
+ void *page;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
return -EOPNOTSUPP;
@@ -425,10 +430,11 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
pTcon = tlink_tcon(tlink);
xid = get_xid();
+ page = alloc_dentry_path();
- full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
+ full_path = build_path_from_dentry(direntry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
goto list_ea_exit;
}
/* return dos attributes as pseudo xattr */
@@ -442,7 +448,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, NULL, data, buf_size, cifs_sb);
list_ea_exit:
- kfree(full_path);
+ free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
return rc;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 128d63df5bfb..ef5ca22bfb3e 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -175,10 +175,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
ret = call_mmap(vma->vm_file, vma);
if (ret) {
- /* if call_mmap fails, our caller will put coda_file so we
- * should drop the reference to the host_file that we got.
+ /* if call_mmap fails, our caller will put host_file so we
+ * should drop the reference to the coda_file that we got.
*/
- fput(host_file);
+ fput(coda_file);
kfree(cvm_ops);
} else {
/* here we add redirects for the open/close vm_operations */
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c0fdc1aa70b..2868e3e171ae 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -809,6 +809,16 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
+ /*
+ * Ensures that file size is big enough to contain the current
+ * file postion. This prevents gdb from complaining about
+ * a truncated file if the last "write" to the file was
+ * dump_skip.
+ */
+ if (cprm.to_skip) {
+ cprm.to_skip--;
+ dump_emit(&cprm, "", 1);
+ }
file_end_write(cprm.file);
}
if (ispipe && core_pipe_limit)
@@ -835,7 +845,7 @@ fail:
* do on a core-file: use only these functions to write out all the
* necessary info.
*/
-int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
struct file *file = cprm->file;
loff_t pos = file->f_pos;
@@ -855,9 +865,8 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
return 1;
}
-EXPORT_SYMBOL(dump_emit);
-int dump_skip(struct coredump_params *cprm, size_t nr)
+static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
@@ -869,13 +878,35 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
return 1;
} else {
while (nr > PAGE_SIZE) {
- if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+ if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
return 0;
nr -= PAGE_SIZE;
}
- return dump_emit(cprm, zeroes, nr);
+ return __dump_emit(cprm, zeroes, nr);
}
}
+
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
+{
+ if (cprm->to_skip) {
+ if (!__dump_skip(cprm, cprm->to_skip))
+ return 0;
+ cprm->to_skip = 0;
+ }
+ return __dump_emit(cprm, addr, nr);
+}
+EXPORT_SYMBOL(dump_emit);
+
+void dump_skip_to(struct coredump_params *cprm, unsigned long pos)
+{
+ cprm->to_skip = pos - cprm->pos;
+}
+EXPORT_SYMBOL(dump_skip_to);
+
+void dump_skip(struct coredump_params *cprm, size_t nr)
+{
+ cprm->to_skip += nr;
+}
EXPORT_SYMBOL(dump_skip);
#ifdef CONFIG_ELF_CORE
@@ -902,11 +933,11 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap_local(kaddr);
put_page(page);
+ if (stop)
+ return 0;
} else {
- stop = !dump_skip(cprm, PAGE_SIZE);
+ dump_skip(cprm, PAGE_SIZE);
}
- if (stop)
- return 0;
}
return 1;
}
@@ -914,33 +945,16 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
int dump_align(struct coredump_params *cprm, int align)
{
- unsigned mod = cprm->pos & (align - 1);
+ unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);
if (align & (align - 1))
return 0;
- return mod ? dump_skip(cprm, align - mod) : 1;
+ if (mod)
+ cprm->to_skip += align - mod;
+ return 1;
}
EXPORT_SYMBOL(dump_align);
/*
- * Ensures that file size is big enough to contain the current file
- * postion. This prevents gdb from complaining about a truncated file
- * if the last "write" to the file was dump_skip.
- */
-void dump_truncate(struct coredump_params *cprm)
-{
- struct file *file = cprm->file;
- loff_t offset;
-
- if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- offset = file->f_op->llseek(file, 0, SEEK_CUR);
- if (i_size_read(file->f_mapping->host) < offset)
- do_truncate(file_mnt_user_ns(file), file->f_path.dentry,
- offset, 0, file);
- }
-}
-EXPORT_SYMBOL(dump_truncate);
-
-/*
* The purpose of always_dump_vma() is to make sure that special kernel mappings
* that are useful for post-mortem analysis are included in every core dump.
* In that way we ensure that the core dump is fully interpretable later
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index a5f5c30368a2..2d0c8922f635 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -14,16 +14,30 @@ config FS_ENCRYPTION
F2FS and UBIFS make use of this feature.
# Filesystems supporting encryption must select this if FS_ENCRYPTION. This
-# allows the algorithms to be built as modules when all the filesystems are.
+# allows the algorithms to be built as modules when all the filesystems are,
+# whereas selecting them from FS_ENCRYPTION would force them to be built-in.
+#
+# Note: this option only pulls in the algorithms that filesystem encryption
+# needs "by default". If userspace will use "non-default" encryption modes such
+# as Adiantum encryption, then those other modes need to be explicitly enabled
+# in the crypto API; see Documentation/filesystems/fscrypt.rst for details.
+#
+# Also note that this option only pulls in the generic implementations of the
+# algorithms, not any per-architecture optimized implementations. It is
+# strongly recommended to enable optimized implementations too. It is safe to
+# disable these generic implementations if corresponding optimized
+# implementations will always be available too; for this reason, these are soft
+# dependencies ('imply' rather than 'select'). Only disable these generic
+# implementations if you're sure they will never be needed, though.
config FS_ENCRYPTION_ALGS
tristate
- select CRYPTO_AES
- select CRYPTO_CBC
- select CRYPTO_CTS
- select CRYPTO_ECB
- select CRYPTO_HMAC
- select CRYPTO_SHA512
- select CRYPTO_XTS
+ imply CRYPTO_AES
+ imply CRYPTO_CBC
+ imply CRYPTO_CTS
+ imply CRYPTO_ECB
+ imply CRYPTO_HMAC
+ imply CRYPTO_SHA512
+ imply CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
bool "Enable fscrypt to use inline crypto"
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 686e0ad28788..e813acfaa6e8 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -773,7 +773,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
- char buf[3];
+ char buf[2];
bool val;
int r;
struct dentry *dentry = F_DENTRY(file);
@@ -789,7 +789,6 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
else
buf[0] = 'N';
buf[1] = '\n';
- buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
@@ -865,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct dentry *dentry = F_DENTRY(file);
+ char *str, *copy = NULL;
+ int copy_len, len;
+ ssize_t ret;
+
+ ret = debugfs_file_get(dentry);
+ if (unlikely(ret))
+ return ret;
+
+ str = *(char **)file->private_data;
+ len = strlen(str) + 1;
+ copy = kmalloc(len, GFP_KERNEL);
+ if (!copy) {
+ debugfs_file_put(dentry);
+ return -ENOMEM;
+ }
+
+ copy_len = strscpy(copy, str, len);
+ debugfs_file_put(dentry);
+ if (copy_len < 0) {
+ kfree(copy);
+ return copy_len;
+ }
+
+ copy[copy_len] = '\n';
+
+ ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
+ kfree(copy);
+
+ return ret;
+}
+
+static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ /* This is really only for read-only strings */
+ return -EINVAL;
+}
+
+static const struct file_operations fops_str = {
+ .read = debugfs_read_file_str,
+ .write = debugfs_write_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_str_ro = {
+ .read = debugfs_read_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_str_wo = {
+ .write = debugfs_write_file_str,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+/**
+ * debugfs_create_str - create a debugfs file that is used to read and write a string value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
+ * be returned.
+ */
+void debugfs_create_str(const char *name, umode_t mode,
+ struct dentry *parent, char **value)
+{
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
+ &fops_str_ro, &fops_str_wo);
+}
+
static ssize_t read_file_blob(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 22e86ae4dd5a..1d252164d97b 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -35,7 +35,7 @@
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS;
+static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;
/*
* Don't allow access attributes to be changed whilst the kernel is locked down
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b61491bf3166..b2e86e739d7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -812,6 +812,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
struct buffer_head *map_bh)
{
int ret = 0;
+ int boundary = sdio->boundary; /* dio_send_cur_page may clear it */
if (dio->op == REQ_OP_WRITE) {
/*
@@ -850,10 +851,10 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
out:
/*
- * If sdio->boundary then we want to schedule the IO now to
+ * If boundary then we want to schedule the IO now to
* avoid metadata seeks.
*/
- if (sdio->boundary) {
+ if (boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
if (sdio->bio)
dio_bio_submit(dio, sdio);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 18e9285fbb4c..7169ea873347 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -18,6 +18,7 @@
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/fileattr.h>
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
@@ -1118,6 +1119,23 @@ out:
return rc;
}
+static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa);
+}
+
+static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ int rc;
+
+ rc = vfs_fileattr_set(&init_user_ns, lower_dentry, fa);
+ fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
+
+ return rc;
+}
+
const struct inode_operations ecryptfs_symlink_iops = {
.get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
@@ -1139,6 +1157,8 @@ const struct inode_operations ecryptfs_dir_iops = {
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.listxattr = ecryptfs_listxattr,
+ .fileattr_get = ecryptfs_fileattr_get,
+ .fileattr_set = ecryptfs_fileattr_set,
};
const struct inode_operations ecryptfs_main_iops = {
@@ -1146,6 +1166,8 @@ const struct inode_operations ecryptfs_main_iops = {
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr,
.listxattr = ecryptfs_listxattr,
+ .fileattr_get = ecryptfs_fileattr_get,
+ .fileattr_set = ecryptfs_fileattr_set,
};
static int ecryptfs_xattr_get(const struct xattr_handler *handler,
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index e6bc0302643b..d57ee15874f9 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -106,86 +106,9 @@ out_free:
return size;
}
-static inline unsigned int efivarfs_getflags(struct inode *inode)
-{
- unsigned int i_flags;
- unsigned int flags = 0;
-
- i_flags = inode->i_flags;
- if (i_flags & S_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- return flags;
-}
-
-static int
-efivarfs_ioc_getxflags(struct file *file, void __user *arg)
-{
- struct inode *inode = file->f_mapping->host;
- unsigned int flags = efivarfs_getflags(inode);
-
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
-static int
-efivarfs_ioc_setxflags(struct file *file, void __user *arg)
-{
- struct inode *inode = file->f_mapping->host;
- unsigned int flags;
- unsigned int i_flags = 0;
- unsigned int oldflags = efivarfs_getflags(inode);
- int error;
-
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
-
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -EFAULT;
-
- if (flags & ~FS_IMMUTABLE_FL)
- return -EOPNOTSUPP;
-
- if (flags & FS_IMMUTABLE_FL)
- i_flags |= S_IMMUTABLE;
-
-
- error = mnt_want_write_file(file);
- if (error)
- return error;
-
- inode_lock(inode);
-
- error = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (error)
- goto out;
-
- inode_set_flags(inode, i_flags, S_IMMUTABLE);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(file);
- return error;
-}
-
-static long
-efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
-{
- void __user *arg = (void __user *)p;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- return efivarfs_ioc_getxflags(file, arg);
- case FS_IOC_SETFLAGS:
- return efivarfs_ioc_setxflags(file, arg);
- }
-
- return -ENOTTY;
-}
-
const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
.llseek = no_llseek,
- .unlocked_ioctl = efivarfs_file_ioctl,
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 14e2947975fd..939e5e242b98 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -10,9 +10,12 @@
#include <linux/kmemleak.h>
#include <linux/slab.h>
#include <linux/uuid.h>
+#include <linux/fileattr.h>
#include "internal.h"
+static const struct inode_operations efivarfs_file_inode_operations;
+
struct inode *efivarfs_get_inode(struct super_block *sb,
const struct inode *dir, int mode,
dev_t dev, bool is_removable)
@@ -26,6 +29,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
+ inode->i_op = &efivarfs_file_inode_operations;
inode->i_fop = &efivarfs_file_operations;
break;
case S_IFDIR:
@@ -138,3 +142,43 @@ const struct inode_operations efivarfs_dir_inode_operations = {
.unlink = efivarfs_unlink,
.create = efivarfs_create,
};
+
+static int
+efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ unsigned int i_flags;
+ unsigned int flags = 0;
+
+ i_flags = d_inode(dentry)->i_flags;
+ if (i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+
+ fileattr_fill_flags(fa, flags);
+
+ return 0;
+}
+
+static int
+efivarfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ unsigned int i_flags = 0;
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+
+ if (fa->flags & ~FS_IMMUTABLE_FL)
+ return -EOPNOTSUPP;
+
+ if (fa->flags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+
+ inode_set_flags(d_inode(dentry), i_flags, S_IMMUTABLE);
+
+ return 0;
+}
+
+static const struct inode_operations efivarfs_file_inode_operations = {
+ .fileattr_get = efivarfs_fileattr_get,
+ .fileattr_set = efivarfs_fileattr_set,
+};
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 74b0aaa7114c..858b3339f381 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -76,17 +76,3 @@ config EROFS_FS_ZIP
If you don't want to enable compression feature, say N.
-config EROFS_FS_CLUSTER_PAGE_LIMIT
- int "EROFS Cluster Pages Hard Limit"
- depends on EROFS_FS_ZIP
- range 1 256
- default "1"
- help
- Indicates maximum # of pages of a compressed
- physical cluster.
-
- For example, if files in a image were compressed
- into 8k-unit, hard limit should not be configured
- less than 2. Otherwise, the image will be refused
- to mount on this kernel.
-
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index af159539fc1b..1f9aced49070 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 1249e74b3bf0..ebac756cb2a3 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -109,21 +109,6 @@ err_out:
return err;
}
-int erofs_map_blocks(struct inode *inode,
- struct erofs_map_blocks *map, int flags)
-{
- if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
- int err = z_erofs_map_blocks_iter(inode, map, flags);
-
- if (map->mpage) {
- put_page(map->mpage);
- map->mpage = NULL;
- }
- return err;
- }
- return erofs_map_blocks_flatmode(inode, map, flags);
-}
-
static inline struct bio *erofs_read_raw_page(struct bio *bio,
struct address_space *mapping,
struct page *page,
@@ -159,7 +144,7 @@ submit_bio_retry:
erofs_blk_t blknr;
unsigned int blkoff;
- err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
if (err)
goto err_out;
@@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
return 0;
}
- if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW))
+ if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
return erofs_blknr(map.m_pa);
return 0;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1cb1ffd10569..88e33addf229 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -28,6 +28,42 @@ struct z_erofs_decompressor {
char *name;
};
+int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int size)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ u16 distance;
+
+ if (lz4) {
+ if (size < sizeof(struct z_erofs_lz4_cfgs)) {
+ erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ distance = le16_to_cpu(lz4->max_distance);
+
+ sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+ if (!sbi->lz4.max_pclusterblks) {
+ sbi->lz4.max_pclusterblks = 1; /* reserved case */
+ } else if (sbi->lz4.max_pclusterblks >
+ Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+ erofs_err(sb, "too large lz4 pclusterblks %u",
+ sbi->lz4.max_pclusterblks);
+ return -EINVAL;
+ } else if (sbi->lz4.max_pclusterblks >= 2) {
+ erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
+ }
+ } else {
+ distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+ sbi->lz4.max_pclusterblks = 1;
+ }
+
+ sbi->lz4.max_distance_pages = distance ?
+ DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
+ LZ4_MAX_DISTANCE_PAGES;
+ return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+}
+
static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
struct list_head *pagepool)
{
@@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
+ unsigned int lz4_max_distance_pages =
+ EROFS_SB(rq->sb)->lz4.max_distance_pages;
void *kaddr = NULL;
unsigned int i, j, top;
@@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
struct page *const page = rq->out[i];
struct page *victim;
- if (j >= LZ4_MAX_DISTANCE_PAGES)
+ if (j >= lz4_max_distance_pages)
j = 0;
/* 'valid' bounced can only be tested after a complete round */
if (test_bit(j, bounced)) {
- DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
- DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
- availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+ DBG_BUGON(i < lz4_max_distance_pages);
+ DBG_BUGON(top >= lz4_max_distance_pages);
+ availables[top++] = rq->out[i - lz4_max_distance_pages];
}
if (page) {
@@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool, GFP_KERNEL);
- if (!victim)
- return -ENOMEM;
+ victim = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
return kaddr ? 1 : 0;
}
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
- u8 *src, unsigned int pageofs_in)
+static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+ void *inpage, unsigned int *inputmargin, int *maptype,
+ bool support_0padding)
{
- /*
- * if in-place decompression is ongoing, those decompressed
- * pages should be copied in order to avoid being overlapped.
- */
- struct page **in = rq->in;
- u8 *const tmp = erofs_get_pcpubuf(0);
- u8 *tmpp = tmp;
- unsigned int inlen = rq->inputsize - pageofs_in;
- unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
-
- while (tmpp < tmp + inlen) {
- if (!src)
- src = kmap_atomic(*in);
- memcpy(tmpp, src + pageofs_in, count);
- kunmap_atomic(src);
- src = NULL;
- tmpp += count;
- pageofs_in = 0;
- count = PAGE_SIZE;
+ unsigned int nrpages_in, nrpages_out;
+ unsigned int ofull, oend, inputsize, total, i, j;
+ struct page **in;
+ void *src, *tmp;
+
+ inputsize = rq->inputsize;
+ nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
+ oend = rq->pageofs_out + rq->outputsize;
+ ofull = PAGE_ALIGN(oend);
+ nrpages_out = ofull >> PAGE_SHIFT;
+
+ if (rq->inplace_io) {
+ if (rq->partial_decoding || !support_0padding ||
+ ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+ goto docopy;
+
+ for (i = 0; i < nrpages_in; ++i) {
+ DBG_BUGON(rq->in[i] == NULL);
+ for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+ if (rq->out[j] == rq->in[i])
+ goto docopy;
+ }
+ }
+
+ if (nrpages_in <= 1) {
+ *maptype = 0;
+ return inpage;
+ }
+ kunmap_atomic(inpage);
+ might_sleep();
+ src = erofs_vm_map_ram(rq->in, nrpages_in);
+ if (!src)
+ return ERR_PTR(-ENOMEM);
+ *maptype = 1;
+ return src;
+
+docopy:
+ /* Or copy compressed data which can be overlapped to per-CPU buffer */
+ in = rq->in;
+ src = erofs_get_pcpubuf(nrpages_in);
+ if (!src) {
+ DBG_BUGON(1);
+ kunmap_atomic(inpage);
+ return ERR_PTR(-EFAULT);
+ }
+
+ tmp = src;
+ total = rq->inputsize;
+ while (total) {
+ unsigned int page_copycnt =
+ min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+ if (!inpage)
+ inpage = kmap_atomic(*in);
+ memcpy(tmp, inpage + *inputmargin, page_copycnt);
+ kunmap_atomic(inpage);
+ inpage = NULL;
+ tmp += page_copycnt;
+ total -= page_copycnt;
++in;
+ *inputmargin = 0;
}
- return tmp;
+ *maptype = 2;
+ return src;
}
static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
{
- unsigned int inputmargin, inlen;
- u8 *src;
- bool copied, support_0padding;
- int ret;
-
- if (rq->inputsize > PAGE_SIZE)
- return -EOPNOTSUPP;
+ unsigned int inputmargin;
+ u8 *headpage, *src;
+ bool support_0padding;
+ int ret, maptype;
- src = kmap_atomic(*rq->in);
+ DBG_BUGON(*rq->in == NULL);
+ headpage = kmap_atomic(*rq->in);
inputmargin = 0;
support_0padding = false;
/* decompression inplace is only safe when 0padding is enabled */
- if (EROFS_SB(rq->sb)->feature_incompat &
- EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
+ if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
support_0padding = true;
- while (!src[inputmargin & ~PAGE_MASK])
+ while (!headpage[inputmargin & ~PAGE_MASK])
if (!(++inputmargin & ~PAGE_MASK))
break;
if (inputmargin >= rq->inputsize) {
- kunmap_atomic(src);
+ kunmap_atomic(headpage);
return -EIO;
}
}
- copied = false;
- inlen = rq->inputsize - inputmargin;
- if (rq->inplace_io) {
- const uint oend = (rq->pageofs_out +
- rq->outputsize) & ~PAGE_MASK;
- const uint nr = PAGE_ALIGN(rq->pageofs_out +
- rq->outputsize) >> PAGE_SHIFT;
-
- if (rq->partial_decoding || !support_0padding ||
- rq->out[nr - 1] != rq->in[0] ||
- rq->inputsize - oend <
- LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
- src = generic_copy_inplace_data(rq, src, inputmargin);
- inputmargin = 0;
- copied = true;
- }
- }
+ rq->inputsize -= inputmargin;
+ src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
+ support_0padding);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
/* legacy format could compress extra data in a pcluster. */
if (rq->partial_decoding || !support_0padding)
ret = LZ4_decompress_safe_partial(src + inputmargin, out,
- inlen, rq->outputsize,
- rq->outputsize);
+ rq->inputsize, rq->outputsize, rq->outputsize);
else
ret = LZ4_decompress_safe(src + inputmargin, out,
- inlen, rq->outputsize);
+ rq->inputsize, rq->outputsize);
if (ret != rq->outputsize) {
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
- ret, inlen, inputmargin, rq->outputsize);
+ ret, rq->inputsize, inputmargin, rq->outputsize);
WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
- 16, 1, src + inputmargin, inlen, true);
+ 16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
16, 1, out, rq->outputsize, true);
@@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
ret = -EIO;
}
- if (copied)
- erofs_put_pcpubuf(src);
- else
+ if (maptype == 0) {
kunmap_atomic(src);
+ } else if (maptype == 1) {
+ vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+ } else if (maptype == 2) {
+ erofs_put_pcpubuf(src);
+ } else {
+ DBG_BUGON(1);
+ return -EFAULT;
+ }
return ret;
}
@@ -234,57 +304,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
const struct z_erofs_decompressor *alg = decompressors + rq->alg;
unsigned int dst_maptype;
void *dst;
- int ret, i;
+ int ret;
- if (nrpages_out == 1 && !rq->inplace_io) {
- DBG_BUGON(!*rq->out);
- dst = kmap_atomic(*rq->out);
- dst_maptype = 0;
- goto dstmap_out;
- }
+ /* two optimized fast paths only for non bigpcluster cases yet */
+ if (rq->inputsize <= PAGE_SIZE) {
+ if (nrpages_out == 1 && !rq->inplace_io) {
+ DBG_BUGON(!*rq->out);
+ dst = kmap_atomic(*rq->out);
+ dst_maptype = 0;
+ goto dstmap_out;
+ }
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(0);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
+ /*
+ * For the case of small output size (especially much less
+ * than PAGE_SIZE), memcpy the decompressed data rather than
+ * compressed data is preferred.
+ */
+ if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+ dst = erofs_get_pcpubuf(1);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ rq->inplace_io = false;
+ ret = alg->decompress(rq, dst);
+ if (!ret)
+ copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+ rq->outputsize);
+
+ erofs_put_pcpubuf(dst);
+ return ret;
+ }
}
+ /* general decoding path which can be used for all cases */
ret = alg->prepare_destpages(rq, pagepool);
- if (ret < 0) {
+ if (ret < 0)
return ret;
- } else if (ret) {
+ if (ret) {
dst = page_address(*rq->out);
dst_maptype = 1;
goto dstmap_out;
}
- i = 0;
- while (1) {
- dst = vm_map_ram(rq->out, nrpages_out, -1);
-
- /* retry two more times (totally 3 times) */
- if (dst || ++i >= 3)
- break;
- vm_unmap_aliases();
- }
-
+ dst = erofs_vm_map_ram(rq->out, nrpages_out);
if (!dst)
return -ENOMEM;
-
dst_maptype = 2;
dstmap_out:
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9ad1615f4474..8739d3adf51f 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -18,15 +18,22 @@
* be incompatible with this kernel version.
*/
#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
-#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+#define EROFS_ALL_FEATURE_INCOMPAT \
+ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
-/* 128-byte erofs on-disk super block */
+#define EROFS_SB_EXTSLOT_SIZE 16
+
+/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
__le32 checksum; /* crc32c(super_block) */
__le32 feature_compat;
__u8 blkszbits; /* support block_size == PAGE_SIZE only */
- __u8 reserved;
+ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
__le16 root_nid; /* nid of root directory */
__le64 inos; /* total valid ino # (== f_files - f_favail) */
@@ -39,7 +46,13 @@ struct erofs_super_block {
__u8 uuid[16]; /* 128-bit uuid for volume */
__u8 volume_name[16]; /* volume name */
__le32 feature_incompat;
- __u8 reserved2[44];
+ union {
+ /* bitmap for available compression algorithms */
+ __le16 available_compr_algs;
+ /* customized sliding window size instead of 64k by default */
+ __le16 lz4_max_distance;
+ } __packed u1;
+ __u8 reserved2[42];
};
/*
@@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_BIT 0
#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_ALL \
+ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
@@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
e->e_name_len + le16_to_cpu(e->e_value_size));
}
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_MAX
};
+#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lz4_cfgs {
+ __le16 max_distance;
+ __le16 max_pclusterblks;
+ u8 reserved[10];
+} __packed;
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
* (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
*/
-#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
-
-#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
struct z_erofs_map_header {
__le32 h_reserved1;
@@ -214,9 +243,7 @@ struct z_erofs_map_header {
__u8 h_algorithmtype;
/*
* bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-4 : (physical - logical) cluster bits of head 1:
- * For example, if logical clustersize = 4096, 1 for 8192.
- * bit 5-7 : (physical - logical) cluster bits of head 2.
+ * bit 3-7 : reserved.
*/
__u8 h_clusterbits;
};
@@ -259,6 +286,13 @@ enum {
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, aka.
+ * block count of a pcluster).
+ */
+#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11)
+
struct z_erofs_vle_decompressed_index {
__le16 di_advise;
/* where to decompress in the head cluster */
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 119fdce1b520..7ed2d7391692 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode,
dic = page_address(page) + *ofs;
ifmt = le16_to_cpu(dic->i_format);
+ if (ifmt & ~EROFS_I_ALL) {
+ erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ ifmt, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 351dae524a0c..f92e3e32b9f4 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -50,6 +50,8 @@ struct erofs_fs_context {
#ifdef CONFIG_EROFS_FS_ZIP
/* current strategy of how to use managed cache */
unsigned char cache_strategy;
+ /* strategy of sync decompression (false - auto, true - force on) */
+ bool readahead_sync_decompress;
/* threshold for decompression synchronously */
unsigned int max_sync_decompress_pages;
@@ -57,6 +59,14 @@ struct erofs_fs_context {
unsigned int mount_opt;
};
+/* all filesystem-wide lz4 configurations */
+struct erofs_sb_lz4_info {
+ /* # of pages needed for EROFS lz4 rolling decompression */
+ u16 max_distance_pages;
+ /* maximum possible blocks for pclusters in the filesystem */
+ u16 max_pclusterblks;
+};
+
struct erofs_sb_info {
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
@@ -67,9 +77,12 @@ struct erofs_sb_info {
struct xarray managed_pslots;
unsigned int shrinker_run_no;
+ u16 available_compr_algs;
/* pseudo inode to manage cached pages */
struct inode *managed_cache;
+
+ struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
u32 blocks;
u32 meta_blkaddr;
@@ -80,6 +93,7 @@ struct erofs_sb_info {
/* inode slot unit size in bit shift */
unsigned char islotbits;
+ u32 sb_size; /* total superblock size */
u32 build_time_nsec;
u64 build_time;
@@ -182,12 +196,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return v;
}
#endif /* !CONFIG_SMP */
-
-/* hard limit of pages per compressed cluster */
-#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES 0
#endif /* !CONFIG_EROFS_FS_ZIP */
/* we strictly follow PAGE_SIZE and no buffer head yet */
@@ -216,6 +224,17 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
}
+#define EROFS_FEATURE_FUNCS(name, compat, feature) \
+static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+{ \
+ return sbi->feature_##compat & EROFS_FEATURE_##feature; \
+}
+
+EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
#define EROFS_I_Z_INITED_BIT 1
@@ -244,7 +263,6 @@ struct erofs_inode {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
- unsigned char z_physical_clusterbits[2];
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -287,7 +305,7 @@ extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
/*
- * Logical to physical block mapping, used by erofs_map_blocks()
+ * Logical to physical block mapping
*
* Different with other file systems, it is used for 2 access modes:
*
@@ -334,7 +352,7 @@ struct erofs_map_blocks {
struct page *mpage;
};
-/* Flags used by erofs_map_blocks() */
+/* Flags used by erofs_map_blocks_flatmode() */
#define EROFS_GET_BLOCKS_RAW 0x0001
/* zmap.c */
@@ -356,8 +374,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
/* data.c */
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
-int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
{
@@ -386,23 +402,30 @@ int erofs_namei(struct inode *dir, struct qstr *name,
/* dir.c */
extern const struct file_operations erofs_dir_fops;
-/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
-
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
- (void)&(buf); \
- preempt_enable(); \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
{
- return ERR_PTR(-EOPNOTSUPP);
+ int retried = 0;
+
+ while (1) {
+ void *p = vm_map_ram(pages, count, -1);
+
+ /* retry two more times (totally 3 times) */
+ if (p || ++retried >= 3)
+ return p;
+ vm_unmap_aliases();
+ }
+ return NULL;
}
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -421,6 +444,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
int erofs_try_to_free_cached_page(struct address_space *mapping,
struct page *page);
+int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int len);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -428,6 +454,16 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_zip_subsystem(void) { return 0; }
static inline void z_erofs_exit_zip_subsystem(void) {}
+static inline int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int len)
+{
+ if (lz4 || dsb->u1.lz4_max_distance) {
+ erofs_err(sb, "lz4 algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
#endif /* !CONFIG_EROFS_FS_ZIP */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 000000000000..6c885575128a
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression is failed (due to unmet inplace margin for
+ * example).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+ raw_spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+ __acquires(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+ raw_spin_lock(&pcb->lock);
+ /* check if the per-CPU buffer is too small */
+ if (requiredpages > pcb->nrpages) {
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+ /* (for sparse checker) pretend pcb->lock is still taken */
+ __acquire(pcb->lock);
+ return NULL;
+ }
+ return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+ DBG_BUGON(pcb->ptr != ptr);
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+}
+
+/* the next step: support per-CPU page buffers hotplug */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(pcb_resize_mutex);
+ static unsigned int pcb_nrpages;
+ LIST_HEAD(pagepool);
+ int delta, cpu, ret, i;
+
+ mutex_lock(&pcb_resize_mutex);
+ delta = nrpages - pcb_nrpages;
+ ret = 0;
+ /* avoid shrinking pcpubuf, since no idea how many fses rely on */
+ if (delta <= 0)
+ goto out;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+ struct page **pages, **oldpages;
+ void *ptr, *old_ptr;
+
+ pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ for (i = 0; i < nrpages; ++i) {
+ pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ }
+ ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ raw_spin_lock(&pcb->lock);
+ old_ptr = pcb->ptr;
+ pcb->ptr = ptr;
+ oldpages = pcb->pages;
+ pcb->pages = pages;
+ i = pcb->nrpages;
+ pcb->nrpages = nrpages;
+ raw_spin_unlock(&pcb->lock);
+
+ if (!oldpages) {
+ DBG_BUGON(old_ptr);
+ continue;
+ }
+
+ if (old_ptr)
+ vunmap(old_ptr);
+free_pagearray:
+ while (i)
+ list_add(&oldpages[--i]->lru, &pagepool);
+ kfree(oldpages);
+ if (ret)
+ break;
+ }
+ pcb_nrpages = nrpages;
+ put_pages_list(&pagepool);
+out:
+ mutex_unlock(&pcb_resize_mutex);
+ return ret;
+}
+
+void erofs_pcpubuf_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ raw_spin_lock_init(&pcb->lock);
+ }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+ int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ if (pcb->ptr) {
+ vunmap(pcb->ptr);
+ pcb->ptr = NULL;
+ }
+ if (!pcb->pages)
+ continue;
+
+ for (i = 0; i < pcb->nrpages; ++i)
+ if (pcb->pages[i])
+ put_page(pcb->pages[i]);
+ kfree(pcb->pages);
+ pcb->pages = NULL;
+ }
+}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d5a6b9b888a5..bbf3bbd908e0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -122,6 +122,136 @@ static bool check_layout_compatibility(struct super_block *sb,
return true;
}
+#ifdef CONFIG_EROFS_FS_ZIP
+/* read variable-sized metadata, offset will be aligned by 4-byte */
+static void *erofs_read_metadata(struct super_block *sb, struct page **pagep,
+ erofs_off_t *offset, int *lengthp)
+{
+ struct page *page = *pagep;
+ u8 *buffer, *ptr;
+ int len, i, cnt;
+ erofs_blk_t blk;
+
+ *offset = round_up(*offset, 4);
+ blk = erofs_blknr(*offset);
+
+ if (!page || page->index != blk) {
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ page = erofs_get_meta_page(sb, blk);
+ if (IS_ERR(page))
+ goto err_nullpage;
+ }
+
+ ptr = kmap(page);
+ len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+ if (!len)
+ len = U16_MAX + 1;
+ buffer = kmalloc(len, GFP_KERNEL);
+ if (!buffer) {
+ buffer = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ *offset += sizeof(__le16);
+ *lengthp = len;
+
+ for (i = 0; i < len; i += cnt) {
+ cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
+ blk = erofs_blknr(*offset);
+
+ if (!page || page->index != blk) {
+ if (page) {
+ kunmap(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ page = erofs_get_meta_page(sb, blk);
+ if (IS_ERR(page)) {
+ kfree(buffer);
+ goto err_nullpage;
+ }
+ ptr = kmap(page);
+ }
+ memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+ *offset += cnt;
+ }
+out:
+ kunmap(page);
+ *pagep = page;
+ return buffer;
+err_nullpage:
+ *pagep = NULL;
+ return page;
+}
+
+static int erofs_load_compr_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi;
+ struct page *page;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret;
+
+ sbi = EROFS_SB(sb);
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EINVAL;
+ }
+
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ page = NULL;
+ alg = 0;
+ ret = 0;
+
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &page, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ goto err;
+ }
+
+ switch (alg) {
+ case Z_EROFS_COMPRESSION_LZ4:
+ ret = z_erofs_load_lz4_config(sb, dsb, data, size);
+ break;
+ default:
+ DBG_BUGON(1);
+ ret = -EFAULT;
+ }
+ kfree(data);
+ if (ret)
+ goto err;
+ }
+err:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+ return ret;
+}
+#else
+static int erofs_load_compr_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ if (dsb->u1.available_compr_algs) {
+ erofs_err(sb, "try to load compressed fs when compression is disabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif
+
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -149,7 +279,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
- if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+ if (erofs_sb_has_sb_chksum(sbi)) {
ret = erofs_superblock_csum_verify(sb, data);
if (ret)
goto out;
@@ -166,6 +296,12 @@ static int erofs_read_superblock(struct super_block *sb)
if (!check_layout_compatibility(sb, dsb))
goto out;
+ sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
+ if (sbi->sb_size > EROFS_BLKSIZ) {
+ erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
+ sbi->sb_size);
+ goto out;
+ }
sbi->blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
@@ -187,7 +323,12 @@ static int erofs_read_superblock(struct super_block *sb)
ret = -EFSCORRUPTED;
goto out;
}
- ret = 0;
+
+ /* parse on-disk compression configurations */
+ if (erofs_sb_has_compr_cfgs(sbi))
+ ret = erofs_load_compr_cfgs(sb, dsb);
+ else
+ ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
out:
kunmap(page);
put_page(page);
@@ -200,6 +341,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
#ifdef CONFIG_EROFS_FS_ZIP
ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
ctx->max_sync_decompress_pages = 3;
+ ctx->readahead_sync_decompress = false;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
set_opt(ctx, XATTR_USER);
@@ -513,6 +655,7 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
goto zip_err;
@@ -542,6 +685,7 @@ static void __exit erofs_module_exit(void)
/* Ensure all RCU free inodes are safe before cache is destroyed. */
rcu_barrier();
kmem_cache_destroy(erofs_inode_cachep);
+ erofs_pcpubuf_exit();
}
/* get filesystem statistics */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index de9986d2f82f..6758c5b19f7c 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
return page;
}
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-static struct {
- u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
-} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
-
-void *erofs_get_pcpubuf(unsigned int pagenr)
-{
- preempt_disable();
- return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
-}
-#endif
-
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 3851e1a64f73..78e4b598ecca 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -11,6 +11,93 @@
#include <trace/events/erofs.h>
/*
+ * since pclustersize is variable for big pcluster feature, introduce slab
+ * pools implementation for different pcluster sizes.
+ */
+struct z_erofs_pcluster_slab {
+ struct kmem_cache *slab;
+ unsigned int maxpages;
+ char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+ _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ if (!pcluster_pool[i].slab)
+ continue;
+ kmem_cache_destroy(pcluster_pool[i].slab);
+ pcluster_pool[i].slab = NULL;
+ }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+ struct z_erofs_pcluster_slab *pcs;
+ struct z_erofs_pcluster *a;
+ unsigned int size;
+
+ for (pcs = pcluster_pool;
+ pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+ size = struct_size(a, compressed_pages, pcs->maxpages);
+
+ sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+ pcs->slab = kmem_cache_create(pcs->name, size, 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (pcs->slab)
+ continue;
+
+ z_erofs_destroy_pcluster_pool();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ struct z_erofs_pcluster *pcl;
+
+ if (nrpages > pcs->maxpages)
+ continue;
+
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ if (!pcl)
+ return ERR_PTR(-ENOMEM);
+ pcl->pclusterpages = nrpages;
+ return pcl;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+ if (pcl->pclusterpages > pcs->maxpages)
+ continue;
+
+ kmem_cache_free(pcs->slab, pcl);
+ return;
+ }
+ DBG_BUGON(1);
+}
+
+/*
* a compressed_pages[] placeholder in order to avoid
* being filled with file pages for in-place decompression.
*/
@@ -37,12 +124,11 @@ typedef tagptr1_t compressed_page_t;
tagptr_fold(compressed_page_t, page, 1)
static struct workqueue_struct *z_erofs_workqueue __read_mostly;
-static struct kmem_cache *pcluster_cachep __read_mostly;
void z_erofs_exit_zip_subsystem(void)
{
destroy_workqueue(z_erofs_workqueue);
- kmem_cache_destroy(pcluster_cachep);
+ z_erofs_destroy_pcluster_pool();
}
static inline int z_erofs_init_workqueue(void)
@@ -59,32 +145,16 @@ static inline int z_erofs_init_workqueue(void)
return z_erofs_workqueue ? 0 : -ENOMEM;
}
-static void z_erofs_pcluster_init_once(void *ptr)
-{
- struct z_erofs_pcluster *pcl = ptr;
- struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
- unsigned int i;
-
- mutex_init(&cl->lock);
- cl->nr_pages = 0;
- cl->vcnt = 0;
- for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
- pcl->compressed_pages[i] = NULL;
-}
-
int __init z_erofs_init_zip_subsystem(void)
{
- pcluster_cachep = kmem_cache_create("erofs_compress",
- Z_EROFS_WORKGROUP_SIZE, 0,
- SLAB_RECLAIM_ACCOUNT,
- z_erofs_pcluster_init_once);
- if (pcluster_cachep) {
- if (!z_erofs_init_workqueue())
- return 0;
-
- kmem_cache_destroy(pcluster_cachep);
- }
- return -ENOMEM;
+ int err = z_erofs_create_pcluster_pool();
+
+ if (err)
+ return err;
+ err = z_erofs_init_workqueue();
+ if (err)
+ z_erofs_destroy_pcluster_pool();
+ return err;
}
enum z_erofs_collectmode {
@@ -104,6 +174,12 @@ enum z_erofs_collectmode {
* |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
*/
COLLECT_PRIMARY_HOOKED,
+ /*
+ * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
+ * could be dispatched into bypass queue later due to uptodated managed
+ * pages. All related online pages cannot be reused for inplace I/O (or
+ * pagevec) since it can be directly decoded without I/O submission.
+ */
COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
/*
* The current collection has been linked with the owned chain, and
@@ -128,7 +204,8 @@ struct z_erofs_collector {
struct z_erofs_pcluster *pcl, *tailpcl;
struct z_erofs_collection *cl;
- struct page **compressedpages;
+ /* a pointer used to pick up inplace I/O pages */
+ struct page **icpage_ptr;
z_erofs_next_pcluster_t owned_head;
enum z_erofs_collectmode mode;
@@ -162,18 +239,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
enum z_erofs_cache_alloctype type,
struct list_head *pagepool)
{
- const struct z_erofs_pcluster *pcl = clt->pcl;
- const unsigned int clusterpages = BIT(pcl->clusterbits);
- struct page **pages = clt->compressedpages;
- pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+ struct z_erofs_pcluster *pcl = clt->pcl;
bool standalone = true;
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ struct page **pages;
+ pgoff_t index;
if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
return;
- for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+ pages = pcl->compressed_pages;
+ index = pcl->obj.index;
+ for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
struct page *page;
compressed_page_t t;
struct page *newpage = NULL;
@@ -186,21 +264,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
if (page) {
t = tag_compressed_page_justfound(page);
- } else if (type == DELAYEDALLOC) {
- t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
- } else if (type == TRYALLOC) {
- newpage = erofs_allocpage(pagepool, gfp);
- if (!newpage)
- goto dontalloc;
-
- set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
- t = tag_compressed_page_justfound(newpage);
- } else { /* DONTALLOC */
-dontalloc:
- if (standalone)
- clt->compressedpages = pages;
+ } else {
+ /* I/O is needed, no possible to decompress directly */
standalone = false;
- continue;
+ switch (type) {
+ case DELAYEDALLOC:
+ t = tagptr_init(compressed_page_t,
+ PAGE_UNALLOCATED);
+ break;
+ case TRYALLOC:
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage,
+ Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
+ break;
+ default: /* DONTALLOC */
+ continue;
+ }
}
if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
@@ -214,7 +296,11 @@ dontalloc:
}
}
- if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
+ /*
+ * don't do inplace I/O if all compressed pages are available in
+ * managed cache since it can be moved to the bypass queue instead.
+ */
+ if (standalone)
clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
@@ -225,14 +311,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
struct address_space *const mapping = MNGD_MAPPING(sbi);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
int i;
/*
* refcount of workgroup is now freezed as 1,
* therefore no need to worry about available decompression users.
*/
- for (i = 0; i < clusterpages; ++i) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
struct page *page = pcl->compressed_pages[i];
if (!page)
@@ -257,13 +342,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
int ret = 0; /* 0 - busy */
if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
unsigned int i;
- for (i = 0; i < clusterpages; ++i) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
if (pcl->compressed_pages[i] == page) {
WRITE_ONCE(pcl->compressed_pages[i], NULL);
ret = 1;
@@ -279,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
- struct page *page)
+static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+ struct page *page)
{
struct z_erofs_pcluster *const pcl = clt->pcl;
- const unsigned int clusterpages = BIT(pcl->clusterbits);
- while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
- if (!cmpxchg(clt->compressedpages++, NULL, page))
+ while (clt->icpage_ptr > pcl->compressed_pages)
+ if (!cmpxchg(--clt->icpage_ptr, NULL, page))
return true;
- }
return false;
}
@@ -399,10 +481,10 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
struct erofs_workgroup *grp;
int err;
- /* no available workgroup, let's allocate one */
- pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
- if (!pcl)
- return -ENOMEM;
+ /* no available pcluster, let's allocate one */
+ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
+ if (IS_ERR(pcl))
+ return PTR_ERR(pcl);
atomic_set(&pcl->obj.refcount, 1);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -416,25 +498,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
else
pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
- pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
- pcl->clusterbits -= PAGE_SHIFT;
-
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = clt->owned_head;
clt->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
-
- /* must be cleaned before freeing to slab */
- DBG_BUGON(cl->nr_pages);
- DBG_BUGON(cl->vcnt);
-
cl->pageofs = map->m_la & ~PAGE_MASK;
/*
* lock all primary followed works before visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
+ mutex_init(&cl->lock);
DBG_BUGON(!mutex_trylock(&cl->lock));
grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
@@ -458,7 +533,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
err_out:
mutex_unlock(&cl->lock);
- kmem_cache_free(pcluster_cachep, pcl);
+ z_erofs_free_pcluster(pcl);
return err;
}
@@ -502,9 +577,8 @@ out:
z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
clt->cl->pagevec, clt->cl->vcnt);
- clt->compressedpages = clt->pcl->compressed_pages;
- if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
- clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+ /* since file-backed online pages are traversed in reverse order */
+ clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
return 0;
}
@@ -517,9 +591,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
struct z_erofs_collection *const cl =
container_of(head, struct z_erofs_collection, rcu);
- kmem_cache_free(pcluster_cachep,
- container_of(cl, struct z_erofs_pcluster,
- primary_collection));
+ z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
+ primary_collection));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
@@ -706,9 +779,12 @@ err_out:
goto out;
}
+static void z_erofs_decompressqueue_work(struct work_struct *work);
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
bool sync, int bios)
{
+ struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
/* wake up the caller thread for sync decompression */
if (sync) {
unsigned long flags;
@@ -720,8 +796,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
return;
}
- if (!atomic_add_return(bios, &io->pending_bios))
+ if (atomic_add_return(bios, &io->pending_bios))
+ return;
+ /* Use workqueue and sync decompression for atomic contexts only */
+ if (in_atomic() || irqs_disabled()) {
queue_work(z_erofs_workqueue, &io->u.work);
+ sbi->ctx.readahead_sync_decompress = true;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
}
static bool z_erofs_page_is_invalidated(struct page *page)
@@ -761,9 +844,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
struct list_head *pagepool)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
struct z_erofs_pagevec_ctor ctor;
- unsigned int i, outputsize, llen, nr_pages;
+ unsigned int i, inputsize, outputsize, llen, nr_pages;
struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
struct page **pages, **compressed_pages, *page;
@@ -843,7 +925,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
overlapped = false;
compressed_pages = pcl->compressed_pages;
- for (i = 0; i < clusterpages; ++i) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
unsigned int pagenr;
page = compressed_pages[i];
@@ -896,12 +978,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
partial = true;
}
+ inputsize = pcl->pclusterpages * PAGE_SIZE;
err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
.sb = sb,
.in = compressed_pages,
.out = pages,
.pageofs_out = cl->pageofs,
- .inputsize = PAGE_SIZE,
+ .inputsize = inputsize,
.outputsize = outputsize,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
@@ -909,8 +992,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
}, pagepool);
out:
- /* must handle all compressed pages before endding pages */
- for (i = 0; i < clusterpages; ++i) {
+ /* must handle all compressed pages before ending pages */
+ for (i = 0; i < pcl->pclusterpages; ++i) {
page = compressed_pages[i];
if (erofs_page_is_managed(sbi, page))
@@ -1213,7 +1296,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
cur = pcl->obj.index;
- end = cur + BIT(pcl->clusterbits);
+ end = cur + pcl->pclusterpages;
/* close the main owned chain at first */
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
@@ -1333,7 +1416,8 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
unsigned int nr_pages = readahead_count(rac);
- bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
+ bool sync = (sbi->ctx.readahead_sync_decompress &&
+ nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *page, *head = NULL;
LIST_HEAD(pagepool);
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index b503b353d4ab..942ee69dff6a 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -10,6 +10,7 @@
#include "internal.h"
#include "zpvec.h"
+#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_NR_INLINE_PAGEVECS 3
/*
@@ -59,16 +60,17 @@ struct z_erofs_pcluster {
/* A: point to next chained pcluster or TAILs */
z_erofs_next_pcluster_t next;
- /* A: compressed pages (including multi-usage pages) */
- struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
-
/* A: lower limit of decompressed length and if full length or not */
unsigned int length;
+ /* I: physical cluster size in pages */
+ unsigned short pclusterpages;
+
/* I: compression algorithm format */
unsigned char algorithmformat;
- /* I: bit shift of physical cluster size */
- unsigned char clusterbits;
+
+ /* A: compressed pages (can be cached or inplaced pages) */
+ struct page *compressed_pages[];
};
#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
@@ -82,8 +84,6 @@ struct z_erofs_pcluster {
#define Z_EROFS_PCLUSTER_NIL (NULL)
-#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
-
struct z_erofs_decompressqueue {
struct super_block *sb;
atomic_t pending_bios;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 14d2de35110c..e62d813756f2 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -11,17 +11,16 @@
int z_erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
- if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+ if (!erofs_sb_has_big_pcluster(sbi) &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
vi->z_algorithmtype[1] = 0;
vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
- vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
- vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
}
-
inode->i_mapping->a_ops = &z_erofs_aops;
return 0;
}
@@ -52,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
- DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+ DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
@@ -77,18 +77,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
}
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
- vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
- ((h->h_clusterbits >> 3) & 3);
-
- if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
- erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
- vi->z_physical_clusterbits[0], vi->nid);
- err = -EOPNOTSUPP;
+ if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_done;
+ }
+ if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
goto unmap_done;
}
-
- vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
- ((h->h_clusterbits >> 5) & 7);
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -111,7 +115,7 @@ struct z_erofs_maprecorder {
u8 type;
u16 clusterofs;
u16 delta[2];
- erofs_blk_t pblk;
+ erofs_blk_t pblk, compressedlcs;
};
static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -174,6 +178,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+ if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedlcs = m->delta[0] &
+ ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ }
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
@@ -210,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int vcnt, base, lo, encodebits, nblk;
int i;
u8 *in, type;
+ bool big_pcluster;
if (1 << amortizedshift == 4)
vcnt = 2;
@@ -218,6 +232,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+ big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
base = round_down(eofs, vcnt << amortizedshift);
in = m->kaddr + base;
@@ -229,7 +244,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->type = type;
if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
- if (i + 1 != vcnt) {
+ if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (!big_pcluster) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ return 0;
+ } else if (i + 1 != (int)vcnt) {
m->delta[0] = lo;
return 0;
}
@@ -242,22 +265,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
in, encodebits * (i - 1), &type);
if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
lo = 0;
+ else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+ lo = 1;
m->delta[0] = lo + 1;
return 0;
}
m->clusterofs = lo;
m->delta[0] = 0;
/* figout out blkaddr (pblk) for HEAD lclusters */
- nblk = 1;
- while (i > 0) {
- --i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
- if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
- i -= lo;
-
- if (i >= 0)
+ if (!big_pcluster) {
+ nblk = 1;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ i -= lo;
+
+ if (i >= 0)
+ ++nblk;
+ }
+ } else {
+ nblk = 0;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ --i;
+ nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ continue;
+ }
+ /* bigpcluster shouldn't have plain d0 == 1 */
+ if (lo <= 1) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ i -= lo - 2;
+ continue;
+ }
++nblk;
+ }
}
in += (vcnt << amortizedshift) - sizeof(__le32);
m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
@@ -381,6 +430,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
return 0;
}
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+ unsigned int initial_lcn)
+{
+ struct erofs_inode *const vi = EROFS_I(m->inode);
+ struct erofs_map_blocks *const map = m->map;
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned long lcn;
+ int err;
+
+ DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
+ if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+ map->m_plen = 1 << lclusterbits;
+ return 0;
+ }
+
+ lcn = m->lcn + 1;
+ if (m->compressedlcs)
+ goto out;
+ if (lcn == initial_lcn)
+ goto err_bonus_cblkcnt;
+
+ err = z_erofs_load_cluster_from_disk(m, lcn);
+ if (err)
+ return err;
+
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (m->delta[0] != 1)
+ goto err_bonus_cblkcnt;
+ if (m->compressedlcs)
+ break;
+ fallthrough;
+ default:
+ erofs_err(m->inode->i_sb,
+ "cannot found CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+out:
+ map->m_plen = m->compressedlcs << lclusterbits;
+ return 0;
+err_bonus_cblkcnt:
+ erofs_err(m->inode->i_sb,
+ "bogus CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+}
+
int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
@@ -392,6 +493,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
};
int err = 0;
unsigned int lclusterbits, endoff;
+ unsigned long initial_lcn;
unsigned long long ofs, end;
trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -410,10 +512,10 @@ int z_erofs_map_blocks_iter(struct inode *inode,
lclusterbits = vi->z_logical_clusterbits;
ofs = map->m_la;
- m.lcn = ofs >> lclusterbits;
+ initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = z_erofs_load_cluster_from_disk(&m, m.lcn);
+ err = z_erofs_load_cluster_from_disk(&m, initial_lcn);
if (err)
goto unmap_out;
@@ -443,7 +545,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
m.delta[0] = 1;
fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- /* get the correspoinding first chunk */
+ /* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
@@ -457,10 +559,12 @@ int z_erofs_map_blocks_iter(struct inode *inode,
}
map->m_llen = end - map->m_la;
- map->m_plen = 1 << lclusterbits;
map->m_pa = blknr_to_addr(m.pblk);
map->m_flags |= EROFS_MAP_MAPPED;
+ err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+ if (err)
+ goto out;
unmap_out:
if (m.kaddr)
kunmap_atomic(m.kaddr);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3196474cbe24..73138ea68342 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -366,8 +366,8 @@ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
*
* @ep: Pointer to the eventpoll context.
*
- * Returns: Returns a value different than zero if ready events are available,
- * or zero otherwise.
+ * Return: a value different than %zero if ready events are available,
+ * or %zero otherwise.
*/
static inline int ep_events_available(struct eventpoll *ep)
{
@@ -1023,7 +1023,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
}
#endif /* CONFIG_KCMP */
-/**
+/*
* Adds a new entry to the tail of the list in a lockless way, i.e.
* multiple CPUs are allowed to call this function concurrently.
*
@@ -1035,10 +1035,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
* completed.
*
* Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail either to the head, otherwise
+ * direction i.e. either to the tail or to the head, otherwise
* concurrent access will corrupt the list.
*
- * Returns %false if element has been already added to the list, %true
+ * Return: %false if element has been already added to the list, %true
* otherwise.
*/
static inline bool list_add_tail_lockless(struct list_head *new,
@@ -1076,11 +1076,11 @@ static inline bool list_add_tail_lockless(struct list_head *new,
return true;
}
-/**
+/*
* Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
* i.e. multiple CPUs are allowed to call this function concurrently.
*
- * Returns %false if epi element has been already chained, %true otherwise.
+ * Return: %false if epi element has been already chained, %true otherwise.
*/
static inline bool chain_epi_lockless(struct epitem *epi)
{
@@ -1105,8 +1105,8 @@ static inline bool chain_epi_lockless(struct epitem *epi)
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*
- * This callback takes a read lock in order not to content with concurrent
- * events from another file descriptors, thus all modifications to ->rdllist
+ * This callback takes a read lock in order not to contend with concurrent
+ * events from another file descriptor, thus all modifications to ->rdllist
* or ->ovflist are lockless. Read lock is paired with the write lock from
* ep_scan_ready_list(), which stops all list modifications and guarantees
* that lists state is seen correctly.
@@ -1335,8 +1335,8 @@ static int reverse_path_check_proc(struct hlist_head *refs, int depth)
* paths such that we will spend all our time waking up
* eventpoll objects.
*
- * Returns: Returns zero if the proposed links don't create too many paths,
- * -1 otherwise.
+ * Return: %zero if the proposed links don't create too many paths,
+ * %-1 otherwise.
*/
static int reverse_path_check(void)
{
@@ -1734,7 +1734,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
}
/**
- * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
*
* @ep: Pointer to the eventpoll context.
@@ -1747,7 +1747,7 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
* until at least one event has been retrieved (or an error
* occurred).
*
- * Returns: Returns the number of ready events which have been fetched, or an
+ * Return: the number of ready events which have been fetched, or an
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1774,9 +1774,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
/*
* This call is racy: We may or may not see events that are being added
- * to the ready list under the lock (e.g., in IRQ callbacks). For, cases
+ * to the ready list under the lock (e.g., in IRQ callbacks). For cases
* with a non-zero timeout, this thread will check the ready list under
- * lock and will added to the wait queue. For, cases with a zero
+ * lock and will add to the wait queue. For cases with a zero
* timeout, the user by definition should not care and will have to
* recheck again.
*/
@@ -1869,15 +1869,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
/**
* ep_loop_check_proc - verify that adding an epoll file inside another
- * epoll structure, does not violate the constraints, in
+ * epoll structure does not violate the constraints, in
* terms of closed loops, or too deep chains (which can
* result in excessive stack usage).
*
- * @priv: Pointer to the epoll file to be currently checked.
+ * @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Returns: Returns zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or -1 otherwise.
+ * Return: %zero if adding the epoll @file inside current epoll
+ * structure @ep does not violate the constraints, or %-1 otherwise.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
@@ -1919,14 +1919,14 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
/**
* ep_loop_check - Performs a check to verify that adding an epoll file (@to)
- * into another epoll file (represented by @from) does not create
+ * into another epoll file (represented by @ep) does not create
* closed loops or too deep chains.
*
- * @from: Pointer to the epoll we are inserting into.
+ * @ep: Pointer to the epoll we are inserting into.
* @to: Pointer to the epoll to be inserted.
*
- * Returns: Returns zero if adding the epoll @to inside the epoll @from
- * does not violate the constraints, or -1 otherwise.
+ * Return: %zero if adding the epoll @to inside the epoll @from
+ * does not violate the constraints, or %-1 otherwise.
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
@@ -2074,8 +2074,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
ep = f.file->private_data;
/*
- * When we insert an epoll file descriptor, inside another epoll file
- * descriptor, there is the change of creating closed loops, which are
+ * When we insert an epoll file descriptor inside another epoll file
+ * descriptor, there is the chance of creating closed loops, which are
* better be handled here, than in more critical paths. While we are
* checking for loops we also determine the list of files reachable
* and hang them on the tfile_check_list, so we can check that we
@@ -2113,7 +2113,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
}
/*
- * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
+ * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3309fb2d327a..23ffe5b96010 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -283,8 +283,6 @@ static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags)
/*
* ioctl commands
*/
-#define EXT2_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT2_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT2_IOC_GETVERSION FS_IOC_GETVERSION
#define EXT2_IOC_SETVERSION FS_IOC_SETVERSION
#define EXT2_IOC_GETRSVSZ _IOR('f', 5, long)
@@ -293,8 +291,6 @@ static inline __u32 ext2_mask_flags(umode_t mode, __u32 flags)
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT2_IOC32_GETVERSION FS_IOC32_GETVERSION
#define EXT2_IOC32_SETVERSION FS_IOC32_SETVERSION
@@ -772,6 +768,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
/* ioctl.c */
+extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int ext2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 96044f5dbc0e..f98466acc672 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -204,4 +204,6 @@ const struct inode_operations ext2_file_inode_operations = {
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.fiemap = ext2_fiemap,
+ .fileattr_get = ext2_fileattr_get,
+ .fileattr_set = ext2_fileattr_set,
};
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b399cbb7022d..e8340bf09b10 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -16,69 +16,51 @@
#include <linux/mount.h>
#include <asm/current.h>
#include <linux/uaccess.h>
+#include <linux/fileattr.h>
-
-long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(filp);
- struct ext2_inode_info *ei = EXT2_I(inode);
- unsigned int flags;
- unsigned short rsv_window_size;
- int ret;
+ struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
- ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+ fileattr_fill_flags(fa, ei->i_flags & EXT2_FL_USER_VISIBLE);
- switch (cmd) {
- case EXT2_IOC_GETFLAGS:
- flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case EXT2_IOC_SETFLAGS: {
- unsigned int oldflags;
+ return 0;
+}
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
+int ext2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ext2_inode_info *ei = EXT2_I(inode);
- if (!inode_owner_or_capable(&init_user_ns, inode)) {
- ret = -EACCES;
- goto setflags_out;
- }
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- if (get_user(flags, (int __user *) arg)) {
- ret = -EFAULT;
- goto setflags_out;
- }
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ return -EPERM;
- flags = ext2_mask_flags(inode->i_mode, flags);
+ ei->i_flags = (ei->i_flags & ~EXT2_FL_USER_MODIFIABLE) |
+ (fa->flags & EXT2_FL_USER_MODIFIABLE);
- inode_lock(inode);
- /* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- inode_unlock(inode);
- ret = -EPERM;
- goto setflags_out;
- }
- oldflags = ei->i_flags;
+ ext2_set_inode_flags(inode);
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
- ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (ret) {
- inode_unlock(inode);
- goto setflags_out;
- }
+ return 0;
+}
- flags = flags & EXT2_FL_USER_MODIFIABLE;
- flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
- ei->i_flags = flags;
- ext2_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
- inode_unlock(inode);
+long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ unsigned short rsv_window_size;
+ int ret;
- mark_inode_dirty(inode);
-setflags_out:
- mnt_drop_write_file(filp);
- return ret;
- }
+ ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
case EXT2_IOC_SETVERSION: {
@@ -163,12 +145,6 @@ long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT2_IOC32_GETFLAGS:
- cmd = EXT2_IOC_GETFLAGS;
- break;
- case EXT2_IOC32_SETFLAGS:
- cmd = EXT2_IOC_SETFLAGS;
- break;
case EXT2_IOC32_GETVERSION:
cmd = EXT2_IOC_GETVERSION;
break;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 3367384d344d..c03fc3c1533e 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -427,6 +427,8 @@ const struct inode_operations ext2_dir_inode_operations = {
.get_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.tmpfile = ext2_tmpfile,
+ .fileattr_get = ext2_fileattr_get,
+ .fileattr_set = ext2_fileattr_set,
};
const struct inode_operations ext2_special_inode_operations = {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 826a56e3bbd2..18f021c988a1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -472,15 +472,6 @@ struct flex_groups {
EXT4_VERITY_FL | \
EXT4_INLINE_DATA_FL)
-/* Flags we can manipulate with through FS_IOC_FSSETXATTR */
-#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
- EXT4_IMMUTABLE_FL | \
- EXT4_APPEND_FL | \
- EXT4_NODUMP_FL | \
- EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL | \
- EXT4_DAX_FL)
-
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
@@ -2928,6 +2919,9 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+int ext4_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
/* migrate.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 194f5d00fa32..5332dd3ea7e2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -919,5 +919,7 @@ const struct inode_operations ext4_file_inode_operations = {
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 4c2a9fe30067..4493ef0c715e 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -354,8 +354,8 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Compare two fsmap items. */
static int ext4_getfsmap_compare(void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct ext4_fsmap *fa;
struct ext4_fsmap *fb;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 633ae7becd61..755a68bb7e22 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -970,7 +970,7 @@ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
- inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode_fsuid_set(inode, mnt_userns);
inode->i_gid = dir->i_gid;
} else
inode_init_owner(mnt_userns, inode, dir, mode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2cf35066f46..e9b0a1fa2ba8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -20,6 +20,7 @@
#include <linux/uaccess.h>
#include <linux/delay.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include <linux/fsmap.h>
@@ -344,11 +345,6 @@ static int ext4_ioctl_setflags(struct inode *inode,
goto flags_out;
oldflags = ei->i_flags;
-
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err)
- goto flags_out;
-
/*
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
@@ -459,9 +455,8 @@ flags_out:
}
#ifdef CONFIG_QUOTA
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
- struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, rc;
@@ -545,7 +540,7 @@ out_stop:
return err;
}
#else
-static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
{
if (projid != EXT4_DEF_PROJID)
return -EOPNOTSUPP;
@@ -553,56 +548,6 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
#endif
-/* Transfer internal flags to xflags */
-static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
-{
- __u32 xflags = 0;
-
- if (iflags & EXT4_SYNC_FL)
- xflags |= FS_XFLAG_SYNC;
- if (iflags & EXT4_IMMUTABLE_FL)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (iflags & EXT4_APPEND_FL)
- xflags |= FS_XFLAG_APPEND;
- if (iflags & EXT4_NODUMP_FL)
- xflags |= FS_XFLAG_NODUMP;
- if (iflags & EXT4_NOATIME_FL)
- xflags |= FS_XFLAG_NOATIME;
- if (iflags & EXT4_PROJINHERIT_FL)
- xflags |= FS_XFLAG_PROJINHERIT;
- if (iflags & EXT4_DAX_FL)
- xflags |= FS_XFLAG_DAX;
- return xflags;
-}
-
-#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
- FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
- FS_XFLAG_DAX)
-
-/* Transfer xflags flags to internal */
-static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
-{
- unsigned long iflags = 0;
-
- if (xflags & FS_XFLAG_SYNC)
- iflags |= EXT4_SYNC_FL;
- if (xflags & FS_XFLAG_IMMUTABLE)
- iflags |= EXT4_IMMUTABLE_FL;
- if (xflags & FS_XFLAG_APPEND)
- iflags |= EXT4_APPEND_FL;
- if (xflags & FS_XFLAG_NODUMP)
- iflags |= EXT4_NODUMP_FL;
- if (xflags & FS_XFLAG_NOATIME)
- iflags |= EXT4_NOATIME_FL;
- if (xflags & FS_XFLAG_PROJINHERIT)
- iflags |= EXT4_PROJINHERIT_FL;
- if (xflags & FS_XFLAG_DAX)
- iflags |= EXT4_DAX_FL;
-
- return iflags;
-}
-
static int ext4_shutdown(struct super_block *sb, unsigned long arg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -770,15 +715,52 @@ group_add_out:
return err;
}
-static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
+ u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
- EXT4_FL_USER_VISIBLE));
+ if (S_ISREG(inode->i_mode))
+ flags &= ~FS_PROJINHERIT_FL;
+ fileattr_fill_flags(fa, flags);
if (ext4_has_feature_project(inode->i_sb))
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+
+ return 0;
+}
+
+int ext4_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ u32 flags = fa->flags;
+ int err = -EOPNOTSUPP;
+
+ ext4_fc_start_update(inode);
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ goto out;
+
+ /*
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
+ */
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ goto out;
+ err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setflags(inode, flags);
+ if (err)
+ goto out;
+ err = ext4_ioctl_setproject(inode, fa->fsx_projid);
+out:
+ ext4_fc_stop_update(inode);
+ return err;
}
/* So that the fiemap access checks can't overflow on 32 bit machines. */
@@ -816,55 +798,13 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
- struct ext4_inode_info *ei = EXT4_I(inode);
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
- unsigned int flags;
ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
case FS_IOC_GETFSMAP:
return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case FS_IOC_GETFLAGS:
- flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
- if (S_ISREG(inode->i_mode))
- flags &= ~EXT4_PROJINHERIT_FL;
- return put_user(flags, (int __user *) arg);
- case FS_IOC_SETFLAGS: {
- int err;
-
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- if (flags & ~EXT4_FL_USER_VISIBLE)
- return -EOPNOTSUPP;
- /*
- * chattr(1) grabs flags via GETFLAGS, modifies the result and
- * passes that to SETFLAGS. So we cannot easily make SETFLAGS
- * more restrictive than just silently masking off visible but
- * not settable flags as we always did.
- */
- flags &= EXT4_FL_USER_MODIFIABLE;
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- inode_lock(inode);
- err = ext4_ioctl_check_immutable(inode,
- from_kprojid(&init_user_ns, ei->i_projid),
- flags);
- if (!err)
- err = ext4_ioctl_setflags(inode, flags);
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
- }
case EXT4_IOC_GETVERSION:
case EXT4_IOC_GETVERSION_OLD:
return put_user(inode->i_generation, (int __user *) arg);
@@ -1246,60 +1186,6 @@ resizefs_out:
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case FS_IOC_FSGETXATTR:
- {
- struct fsxattr fa;
-
- ext4_fill_fsxattr(inode, &fa);
-
- if (copy_to_user((struct fsxattr __user *)arg,
- &fa, sizeof(fa)))
- return -EFAULT;
- return 0;
- }
- case FS_IOC_FSSETXATTR:
- {
- struct fsxattr fa, old_fa;
- int err;
-
- if (copy_from_user(&fa, (struct fsxattr __user *)arg,
- sizeof(fa)))
- return -EFAULT;
-
- /* Make sure caller has proper permission */
- if (!inode_owner_or_capable(mnt_userns, inode))
- return -EACCES;
-
- if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
- return -EOPNOTSUPP;
-
- flags = ext4_xflags_to_iflags(fa.fsx_xflags);
- if (ext4_mask_flags(inode->i_mode, flags) != flags)
- return -EOPNOTSUPP;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- inode_lock(inode);
- ext4_fill_fsxattr(inode, &old_fa);
- err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (err)
- goto out;
- flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
- (flags & EXT4_FL_XFLAG_VISIBLE);
- err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setflags(inode, flags);
- if (err)
- goto out;
- err = ext4_ioctl_setproject(filp, fa.fsx_projid);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return err;
- }
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
@@ -1340,12 +1226,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
break;
@@ -1405,8 +1285,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
- case FS_IOC_FSGETXATTR:
- case FS_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 883e2a7cd4ab..a37a19fabee4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -4172,6 +4172,8 @@ const struct inode_operations ext4_dir_inode_operations = {
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
+ .fileattr_get = ext4_fileattr_get,
+ .fileattr_set = ext4_fileattr_set,
};
const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 00e3cbde472e..07438f46b558 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -370,7 +370,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e2d302ae3a46..11a20dc505aa 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3194,6 +3194,9 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_precache_extents(struct inode *inode);
+int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int f2fs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d26ff2ae3f5e..8a56acbcee4c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -22,6 +22,7 @@
#include <linux/file.h>
#include <linux/nls.h>
#include <linux/sched/signal.h>
+#include <linux/fileattr.h>
#include "f2fs.h"
#include "node.h"
@@ -990,6 +991,8 @@ const struct inode_operations f2fs_file_inode_operations = {
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
+ .fileattr_get = f2fs_fileattr_get,
+ .fileattr_set = f2fs_fileattr_set,
};
static int fill_zero(struct inode *inode, pgoff_t index,
@@ -1871,13 +1874,16 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
return 0;
}
-/* FS_IOC_GETFLAGS and FS_IOC_SETFLAGS support */
+/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */
/*
* To make a new on-disk f2fs i_flag gettable via FS_IOC_GETFLAGS, add an entry
* for it to f2fs_fsflags_map[], and add its FS_*_FL equivalent to
* F2FS_GETTABLE_FS_FL. To also make it settable via FS_IOC_SETFLAGS, also add
* its FS_*_FL equivalent to F2FS_SETTABLE_FS_FL.
+ *
+ * Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and
+ * FS_IOC_FSSETXATTR is done by the VFS.
*/
static const struct {
@@ -1952,67 +1958,6 @@ static inline u32 f2fs_fsflags_to_iflags(u32 fsflags)
return iflags;
}
-static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- struct f2fs_inode_info *fi = F2FS_I(inode);
- u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
-
- if (IS_ENCRYPTED(inode))
- fsflags |= FS_ENCRYPT_FL;
- if (IS_VERITY(inode))
- fsflags |= FS_VERITY_FL;
- if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
- fsflags |= FS_INLINE_DATA_FL;
- if (is_inode_flag_set(inode, FI_PIN_FILE))
- fsflags |= FS_NOCOW_FL;
-
- fsflags &= F2FS_GETTABLE_FS_FL;
-
- return put_user(fsflags, (int __user *)arg);
-}
-
-static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- struct f2fs_inode_info *fi = F2FS_I(inode);
- u32 fsflags, old_fsflags;
- u32 iflags;
- int ret;
-
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
-
- if (get_user(fsflags, (int __user *)arg))
- return -EFAULT;
-
- if (fsflags & ~F2FS_GETTABLE_FS_FL)
- return -EOPNOTSUPP;
- fsflags &= F2FS_SETTABLE_FS_FL;
-
- iflags = f2fs_fsflags_to_iflags(fsflags);
- if (f2fs_mask_flags(inode->i_mode, iflags) != iflags)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- old_fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
- ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
- if (ret)
- goto out;
-
- ret = f2fs_setflags_common(inode, iflags,
- f2fs_fsflags_to_iflags(F2FS_SETTABLE_FS_FL));
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
-}
-
static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -3019,9 +2964,8 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid)
return err;
}
-static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
+static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
- struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *ipage;
@@ -3082,7 +3026,7 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid)
return 0;
}
-static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
+static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
if (projid != F2FS_DEF_PROJID)
return -EOPNOTSUPP;
@@ -3090,123 +3034,55 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
}
#endif
-/* FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR support */
-
-/*
- * To make a new on-disk f2fs i_flag gettable via FS_IOC_FSGETXATTR and settable
- * via FS_IOC_FSSETXATTR, add an entry for it to f2fs_xflags_map[], and add its
- * FS_XFLAG_* equivalent to F2FS_SUPPORTED_XFLAGS.
- */
-
-static const struct {
- u32 iflag;
- u32 xflag;
-} f2fs_xflags_map[] = {
- { F2FS_SYNC_FL, FS_XFLAG_SYNC },
- { F2FS_IMMUTABLE_FL, FS_XFLAG_IMMUTABLE },
- { F2FS_APPEND_FL, FS_XFLAG_APPEND },
- { F2FS_NODUMP_FL, FS_XFLAG_NODUMP },
- { F2FS_NOATIME_FL, FS_XFLAG_NOATIME },
- { F2FS_PROJINHERIT_FL, FS_XFLAG_PROJINHERIT },
-};
-
-#define F2FS_SUPPORTED_XFLAGS ( \
- FS_XFLAG_SYNC | \
- FS_XFLAG_IMMUTABLE | \
- FS_XFLAG_APPEND | \
- FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | \
- FS_XFLAG_PROJINHERIT)
-
-/* Convert f2fs on-disk i_flags to FS_IOC_FS{GET,SET}XATTR flags */
-static inline u32 f2fs_iflags_to_xflags(u32 iflags)
-{
- u32 xflags = 0;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++)
- if (iflags & f2fs_xflags_map[i].iflag)
- xflags |= f2fs_xflags_map[i].xflag;
-
- return xflags;
-}
-
-/* Convert FS_IOC_FS{GET,SET}XATTR flags to f2fs on-disk i_flags */
-static inline u32 f2fs_xflags_to_iflags(u32 xflags)
-{
- u32 iflags = 0;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(f2fs_xflags_map); i++)
- if (xflags & f2fs_xflags_map[i].xflag)
- iflags |= f2fs_xflags_map[i].iflag;
-
- return iflags;
-}
-
-static void f2fs_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ u32 fsflags = f2fs_iflags_to_fsflags(fi->i_flags);
- simple_fill_fsxattr(fa, f2fs_iflags_to_xflags(fi->i_flags));
+ if (IS_ENCRYPTED(inode))
+ fsflags |= FS_ENCRYPT_FL;
+ if (IS_VERITY(inode))
+ fsflags |= FS_VERITY_FL;
+ if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode))
+ fsflags |= FS_INLINE_DATA_FL;
+ if (is_inode_flag_set(inode, FI_PIN_FILE))
+ fsflags |= FS_NOCOW_FL;
+
+ fileattr_fill_flags(fa, fsflags & F2FS_GETTABLE_FS_FL);
if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)))
fa->fsx_projid = from_kprojid(&init_user_ns, fi->i_projid);
-}
-static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- struct fsxattr fa;
-
- f2fs_fill_fsxattr(inode, &fa);
-
- if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa)))
- return -EFAULT;
return 0;
}
-static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
+int f2fs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(filp);
- struct fsxattr fa, old_fa;
+ struct inode *inode = d_inode(dentry);
+ u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL;
u32 iflags;
int err;
- if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa)))
- return -EFAULT;
-
- /* Make sure caller has proper permission */
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
-
- if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ return -EIO;
+ if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode)))
+ return -ENOSPC;
+ if (fsflags & ~F2FS_GETTABLE_FS_FL)
return -EOPNOTSUPP;
+ fsflags &= F2FS_SETTABLE_FS_FL;
+ if (!fa->flags_valid)
+ mask &= FS_COMMON_FL;
- iflags = f2fs_xflags_to_iflags(fa.fsx_xflags);
+ iflags = f2fs_fsflags_to_iflags(fsflags);
if (f2fs_mask_flags(inode->i_mode, iflags) != iflags)
return -EOPNOTSUPP;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- inode_lock(inode);
-
- f2fs_fill_fsxattr(inode, &old_fa);
- err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (err)
- goto out;
-
- err = f2fs_setflags_common(inode, iflags,
- f2fs_xflags_to_iflags(F2FS_SUPPORTED_XFLAGS));
- if (err)
- goto out;
+ err = f2fs_setflags_common(inode, iflags, f2fs_fsflags_to_iflags(mask));
+ if (!err)
+ err = f2fs_ioc_setproject(inode, fa->fsx_projid);
- err = f2fs_ioc_setproject(filp, fa.fsx_projid);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
return err;
}
@@ -4051,7 +3927,7 @@ out:
static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
{
- DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, page_idx);
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx);
struct address_space *mapping = inode->i_mapping;
struct page *page;
pgoff_t redirty_idx = page_idx;
@@ -4233,10 +4109,6 @@ out:
static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case FS_IOC_GETFLAGS:
- return f2fs_ioc_getflags(filp, arg);
- case FS_IOC_SETFLAGS:
- return f2fs_ioc_setflags(filp, arg);
case FS_IOC_GETVERSION:
return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
@@ -4285,10 +4157,6 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_flush_device(filp, arg);
case F2FS_IOC_GET_FEATURES:
return f2fs_ioc_get_features(filp, arg);
- case FS_IOC_FSGETXATTR:
- return f2fs_ioc_fsgetxattr(filp, arg);
- case FS_IOC_FSSETXATTR:
- return f2fs_ioc_fssetxattr(filp, arg);
case F2FS_IOC_GET_PIN_FILE:
return f2fs_ioc_get_pin_file(filp, arg);
case F2FS_IOC_SET_PIN_FILE:
@@ -4518,12 +4386,6 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return -ENOSPC;
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
@@ -4552,8 +4414,6 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_DEFRAGMENT:
case F2FS_IOC_FLUSH_DEVICE:
case F2FS_IOC_GET_FEATURES:
- case FS_IOC_FSGETXATTR:
- case FS_IOC_FSSETXATTR:
case F2FS_IOC_GET_PIN_FILE:
case F2FS_IOC_SET_PIN_FILE:
case F2FS_IOC_PRECACHE_EXTENTS:
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 17bd072a5d39..14bf4f65bcb3 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1327,6 +1327,8 @@ const struct inode_operations f2fs_dir_inode_operations = {
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
+ .fileattr_get = f2fs_fileattr_get,
+ .fileattr_set = f2fs_fileattr_set,
};
const struct inode_operations f2fs_symlink_inode_operations = {
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 054ec852b5ea..a7beff28a3c5 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -228,7 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+ DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
diff --git a/fs/file.c b/fs/file.c
index f3a4bac2cbe9..f633348029a5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -629,17 +629,30 @@ int close_fd(unsigned fd)
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+/**
+ * last_fd - return last valid index into fd table
+ * @cur_fds: files struct
+ *
+ * Context: Either rcu read lock or files_lock must be held.
+ *
+ * Returns: Last valid index into fdtable.
+ */
+static inline unsigned last_fd(struct fdtable *fdt)
+{
+ return fdt->max_fds - 1;
+}
+
static inline void __range_cloexec(struct files_struct *cur_fds,
unsigned int fd, unsigned int max_fd)
{
struct fdtable *fdt;
- if (fd > max_fd)
- return;
-
+ /* make sure we're using the correct maximum value */
spin_lock(&cur_fds->file_lock);
fdt = files_fdtable(cur_fds);
- bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
+ max_fd = min(last_fd(fdt), max_fd);
+ if (fd <= max_fd)
+ bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
spin_unlock(&cur_fds->file_lock);
}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 5e796e6c38e5..427efa73b9bd 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -2,6 +2,7 @@
config FSCACHE
tristate "General filesystem local caching manager"
+ select NETFS_SUPPORT
help
This option enables a generic filesystem caching manager that can be
used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 79e08e05ef84..3b2ffa93ac18 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -7,6 +7,7 @@ fscache-y := \
cache.o \
cookie.o \
fsdef.o \
+ io.o \
main.o \
netfs.o \
object.o \
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 08e91efbce53..c483863b740a 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -142,6 +142,10 @@ extern int fscache_wait_for_operation_activation(struct fscache_object *,
atomic_t *,
atomic_t *);
extern void fscache_invalidate_writes(struct fscache_cookie *);
+struct fscache_retrieval *fscache_alloc_retrieval(struct fscache_cookie *cookie,
+ struct address_space *mapping,
+ fscache_rw_complete_t end_io_func,
+ void *context);
/*
* proc.c
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
new file mode 100644
index 000000000000..8ecc1141802f
--- /dev/null
+++ b/fs/fscache/io.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache data I/O routines
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#define FSCACHE_USE_NEW_IO_API
+#include <linux/fscache-cache.h>
+#include <linux/slab.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * Start a cache read operation.
+ * - we return:
+ * -ENOMEM - out of memory, some pages may be being read
+ * -ERESTARTSYS - interrupted, some pages may be being read
+ * -ENOBUFS - no backing object or space available in which to cache any
+ * pages not being read
+ * -ENODATA - no data available in the backing object for some or all of
+ * the pages
+ * 0 - dispatched a read on all pages
+ */
+int __fscache_begin_read_operation(struct netfs_read_request *rreq,
+ struct fscache_cookie *cookie)
+{
+ struct fscache_retrieval *op;
+ struct fscache_object *object;
+ bool wake_cookie = false;
+ int ret;
+
+ _enter("rr=%08x", rreq->debug_id);
+
+ fscache_stat(&fscache_n_retrievals);
+
+ if (hlist_empty(&cookie->backing_objects))
+ goto nobufs;
+
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
+ ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+ if (fscache_wait_for_deferred_lookup(cookie) < 0)
+ return -ERESTARTSYS;
+
+ op = fscache_alloc_retrieval(cookie, NULL, NULL, NULL);
+ if (!op)
+ return -ENOMEM;
+ trace_fscache_page_op(cookie, NULL, &op->op, fscache_page_op_retr_multi);
+
+ spin_lock(&cookie->lock);
+
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
+ goto nobufs_unlock;
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object, cookie_link);
+
+ __fscache_use_cookie(cookie);
+ atomic_inc(&object->n_reads);
+ __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+
+ if (fscache_submit_op(object, &op->op) < 0)
+ goto nobufs_unlock_dec;
+ spin_unlock(&cookie->lock);
+
+ fscache_stat(&fscache_n_retrieval_ops);
+
+ /* we wait for the operation to become active, and then process it
+ * *here*, in this thread, and not in the thread pool */
+ ret = fscache_wait_for_operation_activation(
+ object, &op->op,
+ __fscache_stat(&fscache_n_retrieval_op_waits),
+ __fscache_stat(&fscache_n_retrievals_object_dead));
+ if (ret < 0)
+ goto error;
+
+ /* ask the cache to honour the operation */
+ ret = object->cache->ops->begin_read_operation(rreq, op);
+
+error:
+ if (ret == -ENOMEM)
+ fscache_stat(&fscache_n_retrievals_nomem);
+ else if (ret == -ERESTARTSYS)
+ fscache_stat(&fscache_n_retrievals_intr);
+ else if (ret == -ENODATA)
+ fscache_stat(&fscache_n_retrievals_nodata);
+ else if (ret < 0)
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ else
+ fscache_stat(&fscache_n_retrievals_ok);
+
+ fscache_put_retrieval(op);
+ _leave(" = %d", ret);
+ return ret;
+
+nobufs_unlock_dec:
+ atomic_dec(&object->n_reads);
+ wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs_unlock:
+ spin_unlock(&cookie->lock);
+ fscache_put_retrieval(op);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
+nobufs:
+ fscache_stat(&fscache_n_retrievals_nobufs);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_begin_read_operation);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 26af6fdf1538..991b0a871744 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -299,7 +299,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
/*
* allocate a retrieval op
*/
-static struct fscache_retrieval *fscache_alloc_retrieval(
+struct fscache_retrieval *fscache_alloc_retrieval(
struct fscache_cookie *cookie,
struct address_space *mapping,
fscache_rw_complete_t end_io_func,
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index a5aa93ece8c5..a7c3ed89a3e0 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -278,5 +278,6 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_cache_stale_objects),
atomic_read(&fscache_n_cache_retired_objects),
atomic_read(&fscache_n_cache_culled_objects));
+ netfs_stats_show(m);
return 0;
}
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 8c7021fb2cd4..0c48b35c058d 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o
obj-$(CONFIG_CUSE) += cuse.o
obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
-fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o
+fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
fuse-$(CONFIG_FUSE_DAX) += dax.o
virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 06a18700a845..1b6c001a7dd1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (ret == -ENOMEM)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
- (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ inode_wrong_type(inode, outarg.attr.mode))
goto invalid;
forget_all_cached_acls(inode);
@@ -508,7 +508,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
* 'mknod' + 'open' requests.
*/
static int fuse_create_open(struct inode *dir, struct dentry *entry,
- struct file *file, unsigned flags,
+ struct file *file, unsigned int flags,
umode_t mode)
{
int err;
@@ -1054,7 +1054,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
err = fuse_simple_request(fm, &args);
if (!err) {
if (fuse_invalid_attr(&outarg.attr) ||
- (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ inode_wrong_type(inode, outarg.attr.mode)) {
fuse_make_bad(inode);
err = -EIO;
} else {
@@ -1703,7 +1703,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
}
if (fuse_invalid_attr(&outarg.attr) ||
- (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ inode_wrong_type(inode, outarg.attr.mode)) {
fuse_make_bad(inode);
err = -EIO;
goto error;
@@ -1866,6 +1866,8 @@ static const struct inode_operations fuse_dir_inode_operations = {
.listxattr = fuse_listxattr,
.get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
};
static const struct file_operations fuse_dir_operations = {
@@ -1886,6 +1888,8 @@ static const struct inode_operations fuse_common_inode_operations = {
.listxattr = fuse_listxattr,
.get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
+ .fileattr_get = fuse_fileattr_get,
+ .fileattr_set = fuse_fileattr_set,
};
static const struct inode_operations fuse_symlink_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8cccecb55fb8..e8aa5337eb29 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -14,32 +14,20 @@
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
-#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/fs.h>
-static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
- struct fuse_page_desc **desc)
-{
- struct page **pages;
-
- pages = kzalloc(npages * (sizeof(struct page *) +
- sizeof(struct fuse_page_desc)), flags);
- *desc = (void *) (pages + npages);
-
- return pages;
-}
-
-static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
- int opcode, struct fuse_open_out *outargp)
+static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, int opcode,
+ struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
- inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
+ inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
@@ -136,8 +124,8 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
}
}
-int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
- bool isdir)
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir)
{
struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
@@ -145,7 +133,7 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
ff = fuse_file_alloc(fm);
if (!ff)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ff->fh = 0;
/* Default for no-open */
@@ -154,14 +142,14 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
struct fuse_open_out outarg;
int err;
- err = fuse_send_open(fm, nodeid, file, opcode, &outarg);
+ err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
} else if (err != -ENOSYS) {
fuse_file_free(ff);
- return err;
+ return ERR_PTR(err);
} else {
if (isdir)
fc->no_opendir = 1;
@@ -174,9 +162,19 @@ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
ff->open_flags &= ~FOPEN_DIRECT_IO;
ff->nodeid = nodeid;
- file->private_data = ff;
- return 0;
+ return ff;
+}
+
+int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
+ bool isdir)
+{
+ struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
+
+ if (!IS_ERR(ff))
+ file->private_data = ff;
+
+ return PTR_ERR_OR_ZERO(ff);
}
EXPORT_SYMBOL_GPL(fuse_do_open);
@@ -268,7 +266,7 @@ out:
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
- int flags, int opcode)
+ unsigned int flags, int opcode)
{
struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
@@ -297,22 +295,21 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
ra->args.nocreds = true;
}
-void fuse_release_common(struct file *file, bool isdir)
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir)
{
- struct fuse_inode *fi = get_fuse_inode(file_inode(file));
- struct fuse_file *ff = file->private_data;
+ struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_release_args *ra = ff->release_args;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
- fuse_prepare_release(fi, ff, file->f_flags, opcode);
+ fuse_prepare_release(fi, ff, open_flags, opcode);
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
- ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc,
- (fl_owner_t) file);
+ ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
}
/* Hold inode until release is finished */
- ra->inode = igrab(file_inode(file));
+ ra->inode = igrab(inode);
/*
* Normally this will send the RELEASE request, however if
@@ -326,6 +323,12 @@ void fuse_release_common(struct file *file, bool isdir)
fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}
+void fuse_release_common(struct file *file, bool isdir)
+{
+ fuse_file_release(file_inode(file), file->private_data, file->f_flags,
+ (fl_owner_t) file, isdir);
+}
+
static int fuse_open(struct inode *inode, struct file *file)
{
return fuse_open_common(inode, file, false);
@@ -345,7 +348,8 @@ static int fuse_release(struct inode *inode, struct file *file)
return 0;
}
-void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
@@ -1346,16 +1350,6 @@ out:
return written ? written : err;
}
-static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
- unsigned int index,
- unsigned int nr_pages)
-{
- int i;
-
- for (i = index; i < index + nr_pages; i++)
- descs[i].length = PAGE_SIZE - descs[i].offset;
-}
-
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
return (unsigned long)ii->iov->iov_base + ii->iov_offset;
@@ -2637,363 +2631,6 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
}
/*
- * CUSE servers compiled on 32bit broke on 64bit kernels because the
- * ABI was defined to be 'struct iovec' which is different on 32bit
- * and 64bit. Fortunately we can determine which structure the server
- * used from the size of the reply.
- */
-static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
- size_t transferred, unsigned count,
- bool is_compat)
-{
-#ifdef CONFIG_COMPAT
- if (count * sizeof(struct compat_iovec) == transferred) {
- struct compat_iovec *ciov = src;
- unsigned i;
-
- /*
- * With this interface a 32bit server cannot support
- * non-compat (i.e. ones coming from 64bit apps) ioctl
- * requests
- */
- if (!is_compat)
- return -EINVAL;
-
- for (i = 0; i < count; i++) {
- dst[i].iov_base = compat_ptr(ciov[i].iov_base);
- dst[i].iov_len = ciov[i].iov_len;
- }
- return 0;
- }
-#endif
-
- if (count * sizeof(struct iovec) != transferred)
- return -EIO;
-
- memcpy(dst, src, transferred);
- return 0;
-}
-
-/* Make sure iov_length() won't overflow */
-static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
- size_t count)
-{
- size_t n;
- u32 max = fc->max_pages << PAGE_SHIFT;
-
- for (n = 0; n < count; n++, iov++) {
- if (iov->iov_len > (size_t) max)
- return -ENOMEM;
- max -= iov->iov_len;
- }
- return 0;
-}
-
-static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
- void *src, size_t transferred, unsigned count,
- bool is_compat)
-{
- unsigned i;
- struct fuse_ioctl_iovec *fiov = src;
-
- if (fc->minor < 16) {
- return fuse_copy_ioctl_iovec_old(dst, src, transferred,
- count, is_compat);
- }
-
- if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
- return -EIO;
-
- for (i = 0; i < count; i++) {
- /* Did the server supply an inappropriate value? */
- if (fiov[i].base != (unsigned long) fiov[i].base ||
- fiov[i].len != (unsigned long) fiov[i].len)
- return -EIO;
-
- dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
- dst[i].iov_len = (size_t) fiov[i].len;
-
-#ifdef CONFIG_COMPAT
- if (is_compat &&
- (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
- (compat_size_t) dst[i].iov_len != fiov[i].len))
- return -EIO;
-#endif
- }
-
- return 0;
-}
-
-
-/*
- * For ioctls, there is no generic way to determine how much memory
- * needs to be read and/or written. Furthermore, ioctls are allowed
- * to dereference the passed pointer, so the parameter requires deep
- * copying but FUSE has no idea whatsoever about what to copy in or
- * out.
- *
- * This is solved by allowing FUSE server to retry ioctl with
- * necessary in/out iovecs. Let's assume the ioctl implementation
- * needs to read in the following structure.
- *
- * struct a {
- * char *buf;
- * size_t buflen;
- * }
- *
- * On the first callout to FUSE server, inarg->in_size and
- * inarg->out_size will be NULL; then, the server completes the ioctl
- * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
- * the actual iov array to
- *
- * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
- *
- * which tells FUSE to copy in the requested area and retry the ioctl.
- * On the second round, the server has access to the structure and
- * from that it can tell what to look for next, so on the invocation,
- * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
- *
- * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
- * { .iov_base = a.buf, .iov_len = a.buflen } }
- *
- * FUSE will copy both struct a and the pointed buffer from the
- * process doing the ioctl and retry ioctl with both struct a and the
- * buffer.
- *
- * This time, FUSE server has everything it needs and completes ioctl
- * without FUSE_IOCTL_RETRY which finishes the ioctl call.
- *
- * Copying data out works the same way.
- *
- * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
- * automatically initializes in and out iovs by decoding @cmd with
- * _IOC_* macros and the server is not allowed to request RETRY. This
- * limits ioctl data transfers to well-formed ioctls and is the forced
- * behavior for all FUSE servers.
- */
-long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
- unsigned int flags)
-{
- struct fuse_file *ff = file->private_data;
- struct fuse_mount *fm = ff->fm;
- struct fuse_ioctl_in inarg = {
- .fh = ff->fh,
- .cmd = cmd,
- .arg = arg,
- .flags = flags
- };
- struct fuse_ioctl_out outarg;
- struct iovec *iov_page = NULL;
- struct iovec *in_iov = NULL, *out_iov = NULL;
- unsigned int in_iovs = 0, out_iovs = 0, max_pages;
- size_t in_size, out_size, c;
- ssize_t transferred;
- int err, i;
- struct iov_iter ii;
- struct fuse_args_pages ap = {};
-
-#if BITS_PER_LONG == 32
- inarg.flags |= FUSE_IOCTL_32BIT;
-#else
- if (flags & FUSE_IOCTL_COMPAT) {
- inarg.flags |= FUSE_IOCTL_32BIT;
-#ifdef CONFIG_X86_X32
- if (in_x32_syscall())
- inarg.flags |= FUSE_IOCTL_COMPAT_X32;
-#endif
- }
-#endif
-
- /* assume all the iovs returned by client always fits in a page */
- BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
-
- err = -ENOMEM;
- ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
- iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
- if (!ap.pages || !iov_page)
- goto out;
-
- fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
-
- /*
- * If restricted, initialize IO parameters as encoded in @cmd.
- * RETRY from server is not allowed.
- */
- if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
- struct iovec *iov = iov_page;
-
- iov->iov_base = (void __user *)arg;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- case FS_IOC_SETFLAGS:
- iov->iov_len = sizeof(int);
- break;
- default:
- iov->iov_len = _IOC_SIZE(cmd);
- break;
- }
-
- if (_IOC_DIR(cmd) & _IOC_WRITE) {
- in_iov = iov;
- in_iovs = 1;
- }
-
- if (_IOC_DIR(cmd) & _IOC_READ) {
- out_iov = iov;
- out_iovs = 1;
- }
- }
-
- retry:
- inarg.in_size = in_size = iov_length(in_iov, in_iovs);
- inarg.out_size = out_size = iov_length(out_iov, out_iovs);
-
- /*
- * Out data can be used either for actual out data or iovs,
- * make sure there always is at least one page.
- */
- out_size = max_t(size_t, out_size, PAGE_SIZE);
- max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
-
- /* make sure there are enough buffer pages and init request with them */
- err = -ENOMEM;
- if (max_pages > fm->fc->max_pages)
- goto out;
- while (ap.num_pages < max_pages) {
- ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
- if (!ap.pages[ap.num_pages])
- goto out;
- ap.num_pages++;
- }
-
-
- /* okay, let's send it to the client */
- ap.args.opcode = FUSE_IOCTL;
- ap.args.nodeid = ff->nodeid;
- ap.args.in_numargs = 1;
- ap.args.in_args[0].size = sizeof(inarg);
- ap.args.in_args[0].value = &inarg;
- if (in_size) {
- ap.args.in_numargs++;
- ap.args.in_args[1].size = in_size;
- ap.args.in_pages = true;
-
- err = -EFAULT;
- iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
- if (c != PAGE_SIZE && iov_iter_count(&ii))
- goto out;
- }
- }
-
- ap.args.out_numargs = 2;
- ap.args.out_args[0].size = sizeof(outarg);
- ap.args.out_args[0].value = &outarg;
- ap.args.out_args[1].size = out_size;
- ap.args.out_pages = true;
- ap.args.out_argvar = true;
-
- transferred = fuse_simple_request(fm, &ap.args);
- err = transferred;
- if (transferred < 0)
- goto out;
-
- /* did it ask for retry? */
- if (outarg.flags & FUSE_IOCTL_RETRY) {
- void *vaddr;
-
- /* no retry if in restricted mode */
- err = -EIO;
- if (!(flags & FUSE_IOCTL_UNRESTRICTED))
- goto out;
-
- in_iovs = outarg.in_iovs;
- out_iovs = outarg.out_iovs;
-
- /*
- * Make sure things are in boundary, separate checks
- * are to protect against overflow.
- */
- err = -ENOMEM;
- if (in_iovs > FUSE_IOCTL_MAX_IOV ||
- out_iovs > FUSE_IOCTL_MAX_IOV ||
- in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
- goto out;
-
- vaddr = kmap_atomic(ap.pages[0]);
- err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
- transferred, in_iovs + out_iovs,
- (flags & FUSE_IOCTL_COMPAT) != 0);
- kunmap_atomic(vaddr);
- if (err)
- goto out;
-
- in_iov = iov_page;
- out_iov = in_iov + in_iovs;
-
- err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
- if (err)
- goto out;
-
- err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
- if (err)
- goto out;
-
- goto retry;
- }
-
- err = -EIO;
- if (transferred > inarg.out_size)
- goto out;
-
- err = -EFAULT;
- iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
- for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
- c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
- if (c != PAGE_SIZE && iov_iter_count(&ii))
- goto out;
- }
- err = 0;
- out:
- free_page((unsigned long) iov_page);
- while (ap.num_pages)
- __free_page(ap.pages[--ap.num_pages]);
- kfree(ap.pages);
-
- return err ? err : outarg.result;
-}
-EXPORT_SYMBOL_GPL(fuse_do_ioctl);
-
-long fuse_ioctl_common(struct file *file, unsigned int cmd,
- unsigned long arg, unsigned int flags)
-{
- struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- if (!fuse_allow_current_process(fc))
- return -EACCES;
-
- if (fuse_is_bad(inode))
- return -EIO;
-
- return fuse_do_ioctl(file, cmd, arg, flags);
-}
-
-static long fuse_file_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- return fuse_ioctl_common(file, cmd, arg, 0);
-}
-
-static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
-}
-
-/*
* All files which have been polled are linked to RB tree
* fuse_conn->polled_files which is indexed by kh. Walk the tree and
* find the matching one.
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 63d97a15ffde..ca868b71eb97 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -872,6 +872,28 @@ static inline bool fuse_is_bad(struct inode *inode)
return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state));
}
+static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
+ struct fuse_page_desc **desc)
+{
+ struct page **pages;
+
+ pages = kzalloc(npages * (sizeof(struct page *) +
+ sizeof(struct fuse_page_desc)), flags);
+ *desc = (void *) (pages + npages);
+
+ return pages;
+}
+
+static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
+ unsigned int index,
+ unsigned int nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ descs[i].length = PAGE_SIZE - descs[i].offset;
+}
+
/** Device operations */
extern const struct file_operations fuse_dev_operations;
@@ -932,7 +954,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
void fuse_file_free(struct fuse_file *ff);
void fuse_finish_open(struct inode *inode, struct file *file);
-void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags);
+void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
+ unsigned int flags);
/**
* Send RELEASE or RELEASEDIR request
@@ -1214,4 +1237,19 @@ void fuse_dax_inode_cleanup(struct inode *inode);
bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
void fuse_dax_cancel_work(struct fuse_conn *fc);
+/* ioctl.c */
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
+
+/* file.c */
+
+struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
+ unsigned int open_flags, bool isdir);
+void fuse_file_release(struct inode *inode, struct fuse_file *ff,
+ unsigned int open_flags, fl_owner_t id, bool isdir);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index b0e18b470e91..b4b956da3851 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -350,7 +350,7 @@ retry:
inode->i_generation = generation;
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
- } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+ } else if (inode_wrong_type(inode, attr->mode)) {
/* Inode has changed type, any I/O on the old should fail */
fuse_make_bad(inode);
iput(inode);
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
new file mode 100644
index 000000000000..546ea3d58fb4
--- /dev/null
+++ b/fs/fuse/ioctl.c
@@ -0,0 +1,490 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 Red Hat, Inc.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/uio.h>
+#include <linux/compat.h>
+#include <linux/fileattr.h>
+
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit. Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+ size_t transferred, unsigned count,
+ bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+ if (count * sizeof(struct compat_iovec) == transferred) {
+ struct compat_iovec *ciov = src;
+ unsigned i;
+
+ /*
+ * With this interface a 32bit server cannot support
+ * non-compat (i.e. ones coming from 64bit apps) ioctl
+ * requests
+ */
+ if (!is_compat)
+ return -EINVAL;
+
+ for (i = 0; i < count; i++) {
+ dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+ dst[i].iov_len = ciov[i].iov_len;
+ }
+ return 0;
+ }
+#endif
+
+ if (count * sizeof(struct iovec) != transferred)
+ return -EIO;
+
+ memcpy(dst, src, transferred);
+ return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
+ size_t count)
+{
+ size_t n;
+ u32 max = fc->max_pages << PAGE_SHIFT;
+
+ for (n = 0; n < count; n++, iov++) {
+ if (iov->iov_len > (size_t) max)
+ return -ENOMEM;
+ max -= iov->iov_len;
+ }
+ return 0;
+}
+
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+ void *src, size_t transferred, unsigned count,
+ bool is_compat)
+{
+ unsigned i;
+ struct fuse_ioctl_iovec *fiov = src;
+
+ if (fc->minor < 16) {
+ return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+ count, is_compat);
+ }
+
+ if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+ return -EIO;
+
+ for (i = 0; i < count; i++) {
+ /* Did the server supply an inappropriate value? */
+ if (fiov[i].base != (unsigned long) fiov[i].base ||
+ fiov[i].len != (unsigned long) fiov[i].len)
+ return -EIO;
+
+ dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+ dst[i].iov_len = (size_t) fiov[i].len;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat &&
+ (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+ (compat_size_t) dst[i].iov_len != fiov[i].len))
+ return -EIO;
+#endif
+ }
+
+ return 0;
+}
+
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written. Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs. Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ * char *buf;
+ * size_t buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
+ * { .iov_base = a.buf, .iov_len = a.buflen } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY. This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg = {
+ .fh = ff->fh,
+ .cmd = cmd,
+ .arg = arg,
+ .flags = flags
+ };
+ struct fuse_ioctl_out outarg;
+ struct iovec *iov_page = NULL;
+ struct iovec *in_iov = NULL, *out_iov = NULL;
+ unsigned int in_iovs = 0, out_iovs = 0, max_pages;
+ size_t in_size, out_size, c;
+ ssize_t transferred;
+ int err, i;
+ struct iov_iter ii;
+ struct fuse_args_pages ap = {};
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+ if (flags & FUSE_IOCTL_COMPAT) {
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#ifdef CONFIG_X86_X32
+ if (in_x32_syscall())
+ inarg.flags |= FUSE_IOCTL_COMPAT_X32;
+#endif
+ }
+#endif
+
+ /* assume all the iovs returned by client always fits in a page */
+ BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+ err = -ENOMEM;
+ ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs);
+ iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
+ if (!ap.pages || !iov_page)
+ goto out;
+
+ fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages);
+
+ /*
+ * If restricted, initialize IO parameters as encoded in @cmd.
+ * RETRY from server is not allowed.
+ */
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+ struct iovec *iov = iov_page;
+
+ iov->iov_base = (void __user *)arg;
+ iov->iov_len = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ in_iov = iov;
+ in_iovs = 1;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ) {
+ out_iov = iov;
+ out_iovs = 1;
+ }
+ }
+
+ retry:
+ inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+ inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+ /*
+ * Out data can be used either for actual out data or iovs,
+ * make sure there always is at least one page.
+ */
+ out_size = max_t(size_t, out_size, PAGE_SIZE);
+ max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+ /* make sure there are enough buffer pages and init request with them */
+ err = -ENOMEM;
+ if (max_pages > fm->fc->max_pages)
+ goto out;
+ while (ap.num_pages < max_pages) {
+ ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!ap.pages[ap.num_pages])
+ goto out;
+ ap.num_pages++;
+ }
+
+
+ /* okay, let's send it to the client */
+ ap.args.opcode = FUSE_IOCTL;
+ ap.args.nodeid = ff->nodeid;
+ ap.args.in_numargs = 1;
+ ap.args.in_args[0].size = sizeof(inarg);
+ ap.args.in_args[0].value = &inarg;
+ if (in_size) {
+ ap.args.in_numargs++;
+ ap.args.in_args[1].size = in_size;
+ ap.args.in_pages = true;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ }
+
+ ap.args.out_numargs = 2;
+ ap.args.out_args[0].size = sizeof(outarg);
+ ap.args.out_args[0].value = &outarg;
+ ap.args.out_args[1].size = out_size;
+ ap.args.out_pages = true;
+ ap.args.out_argvar = true;
+
+ transferred = fuse_simple_request(fm, &ap.args);
+ err = transferred;
+ if (transferred < 0)
+ goto out;
+
+ /* did it ask for retry? */
+ if (outarg.flags & FUSE_IOCTL_RETRY) {
+ void *vaddr;
+
+ /* no retry if in restricted mode */
+ err = -EIO;
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+ goto out;
+
+ in_iovs = outarg.in_iovs;
+ out_iovs = outarg.out_iovs;
+
+ /*
+ * Make sure things are in boundary, separate checks
+ * are to protect against overflow.
+ */
+ err = -ENOMEM;
+ if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+ out_iovs > FUSE_IOCTL_MAX_IOV ||
+ in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+ goto out;
+
+ vaddr = kmap_atomic(ap.pages[0]);
+ err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr,
+ transferred, in_iovs + out_iovs,
+ (flags & FUSE_IOCTL_COMPAT) != 0);
+ kunmap_atomic(vaddr);
+ if (err)
+ goto out;
+
+ in_iov = iov_page;
+ out_iov = in_iov + in_iovs;
+
+ err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs);
+ if (err)
+ goto out;
+
+ err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs);
+ if (err)
+ goto out;
+
+ goto retry;
+ }
+
+ err = -EIO;
+ if (transferred > inarg.out_size)
+ goto out;
+
+ err = -EFAULT;
+ iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
+ c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
+ if (c != PAGE_SIZE && iov_iter_count(&ii))
+ goto out;
+ }
+ err = 0;
+ out:
+ free_page((unsigned long) iov_page);
+ while (ap.num_pages)
+ __free_page(ap.pages[--ap.num_pages]);
+ kfree(ap.pages);
+
+ return err ? err : outarg.result;
+}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, 0);
+}
+
+long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
+ unsigned int cmd, void *ptr, size_t size)
+{
+ struct fuse_mount *fm = ff->fm;
+ struct fuse_ioctl_in inarg;
+ struct fuse_ioctl_out outarg;
+ FUSE_ARGS(args);
+ int err;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.cmd = cmd;
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
+ if (S_ISDIR(inode->i_mode))
+ inarg.flags |= FUSE_IOCTL_DIR;
+
+ if (_IOC_DIR(cmd) & _IOC_READ)
+ inarg.out_size = size;
+ if (_IOC_DIR(cmd) & _IOC_WRITE)
+ inarg.in_size = size;
+
+ args.opcode = FUSE_IOCTL;
+ args.nodeid = ff->nodeid;
+ args.in_numargs = 2;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.in_args[1].size = inarg.in_size;
+ args.in_args[1].value = ptr;
+ args.out_numargs = 2;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ args.out_args[1].size = inarg.out_size;
+ args.out_args[1].value = ptr;
+
+ err = fuse_simple_request(fm, &args);
+ if (!err && outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+
+ return err;
+}
+
+static struct fuse_file *fuse_priv_ioctl_prepare(struct inode *inode)
+{
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ bool isdir = S_ISDIR(inode->i_mode);
+
+ if (!S_ISREG(inode->i_mode) && !isdir)
+ return ERR_PTR(-ENOTTY);
+
+ return fuse_file_open(fm, get_node_id(inode), O_RDONLY, isdir);
+}
+
+static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
+{
+ fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
+}
+
+int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_GETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_flags(fa, flags);
+ } else {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSGETXATTR,
+ &xfa, sizeof(xfa));
+ if (err)
+ goto cleanup;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+ }
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
+
+int fuse_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_file *ff;
+ unsigned int flags = fa->flags;
+ struct fsxattr xfa;
+ int err;
+
+ ff = fuse_priv_ioctl_prepare(inode);
+ if (IS_ERR(ff))
+ return PTR_ERR(ff);
+
+ if (fa->flags_valid) {
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_SETFLAGS,
+ &flags, sizeof(flags));
+ if (err)
+ goto cleanup;
+ } else {
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ err = fuse_priv_ioctl(inode, ff, FS_IOC_FSSETXATTR,
+ &xfa, sizeof(xfa));
+ }
+
+cleanup:
+ fuse_priv_ioctl_cleanup(inode, ff);
+
+ return err;
+}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 3441ffa740f3..277f7041d55a 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -202,7 +202,7 @@ retry:
inode = d_inode(dentry);
if (!inode ||
get_node_id(inode) != o->nodeid ||
- ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ inode_wrong_type(inode, o->attr.mode)) {
d_invalidate(dentry);
dput(dentry);
goto retry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2d500f90cdac..b46714d48107 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
#include <linux/dlm_plock.h>
#include <linux/delay.h>
#include <linux/backing-dev.h>
+#include <linux/fileattr.h>
#include "gfs2.h"
#include "incore.h"
@@ -153,14 +154,17 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
return fsflags;
}
-static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int error;
u32 fsflags;
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
error = gfs2_glock_nq(&gh);
if (error)
@@ -168,8 +172,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
- if (put_user(fsflags, ptr))
- error = -EFAULT;
+ fileattr_fill_flags(fa, fsflags);
gfs2_glock_dq(&gh);
out_uninit:
@@ -213,33 +216,19 @@ void gfs2_set_inode_flags(struct inode *inode)
* @fsflags: The FS_* inode flags passed in
*
*/
-static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
+static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask,
const u32 fsflags)
{
- struct inode *inode = file_inode(filp);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *bh;
struct gfs2_holder gh;
int error;
- u32 new_flags, flags, oldflags;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
+ u32 new_flags, flags;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (error)
- goto out_drop_write;
-
- oldflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags);
- error = vfs_ioc_setflags_prepare(inode, oldflags, fsflags);
- if (error)
- goto out;
-
- error = -EACCES;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- goto out;
+ return error;
error = 0;
flags = ip->i_diskflags;
@@ -252,9 +241,6 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask,
goto out;
if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
goto out;
- if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
- !capable(CAP_LINUX_IMMUTABLE))
- goto out;
if (!IS_IMMUTABLE(inode)) {
error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
if (error)
@@ -291,20 +277,22 @@ out_trans_end:
gfs2_trans_end(sdp);
out:
gfs2_glock_dq_uninit(&gh);
-out_drop_write:
- mnt_drop_write_file(filp);
return error;
}
-static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+int gfs2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(filp);
- u32 fsflags, gfsflags = 0;
+ struct inode *inode = d_inode(dentry);
+ u32 fsflags = fa->flags, gfsflags = 0;
u32 mask;
int i;
- if (get_user(fsflags, ptr))
- return -EFAULT;
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) {
if (fsflags & fsflag_gfs2flag[i].fsflag) {
@@ -325,7 +313,7 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA);
}
- return do_gfs2_set_flags(filp, gfsflags, mask, fsflags);
+ return do_gfs2_set_flags(inode, gfsflags, mask, fsflags);
}
static int gfs2_getlabel(struct file *filp, char __user *label)
@@ -342,10 +330,6 @@ static int gfs2_getlabel(struct file *filp, char __user *label)
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch(cmd) {
- case FS_IOC_GETFLAGS:
- return gfs2_get_flags(filp, (u32 __user *)arg);
- case FS_IOC_SETFLAGS:
- return gfs2_set_flags(filp, (u32 __user *)arg);
case FITRIM:
return gfs2_fitrim(filp, (void __user *)arg);
case FS_IOC_GETFSLABEL:
@@ -359,13 +343,6 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch(cmd) {
- /* These are just misnamed, they actually get/put from/to user an int */
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
/* Keep this list in sync with gfs2_ioctl */
case FITRIM:
case FS_IOC_GETFSLABEL:
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9567520d79f7..c06a6cdf05de 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1732,7 +1732,8 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
spin_unlock(&gl->gl_lockref.lock);
}
-static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int glock_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct gfs2_glock *gla, *glb;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8e32d569c8bf..ef0b583c3417 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -394,18 +394,24 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
const struct gfs2_dinode *str = buf;
struct timespec64 atime;
u16 height, depth;
+ umode_t mode = be32_to_cpu(str->di_mode);
+ bool is_new = ip->i_inode.i_flags & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
+ if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode)))
+ goto corrupt;
ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
- ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
- ip->i_inode.i_rdev = 0;
- switch (ip->i_inode.i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
- be32_to_cpu(str->di_minor));
- break;
+ ip->i_inode.i_mode = mode;
+ if (is_new) {
+ ip->i_inode.i_rdev = 0;
+ switch (mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
+ be32_to_cpu(str->di_minor));
+ break;
+ }
}
i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c9775d5c6594..6a63607ac526 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -2157,6 +2157,8 @@ static const struct inode_operations gfs2_file_iops = {
.get_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
+ .fileattr_get = gfs2_fileattr_get,
+ .fileattr_set = gfs2_fileattr_set,
};
static const struct inode_operations gfs2_dir_iops = {
@@ -2178,6 +2180,8 @@ static const struct inode_operations gfs2_dir_iops = {
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.atomic_open = gfs2_atomic_open,
+ .fileattr_get = gfs2_fileattr_get,
+ .fileattr_set = gfs2_fileattr_set,
};
static const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c447bd5b3017..9e898660eae0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -111,6 +111,9 @@ extern loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
+extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int gfs2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
extern void gfs2_set_inode_flags(struct inode *inode);
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6410281546f9..88649b43fcff 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -695,7 +695,7 @@ void log_flush_wait(struct gfs2_sbd *sdp)
}
}
-static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b)
{
struct gfs2_inode *ipa, *ipb;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a82f4747aa8d..b4809967efc6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -634,7 +634,8 @@ static void gfs2_check_magic(struct buffer_head *bh)
kunmap_atomic(kaddr);
}
-static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int blocknr_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct gfs2_bufdata *bda, *bdb;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 97076d3f562f..8fb9602d79b4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -162,8 +162,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
int error;
error = init_threads(sdp);
- if (error)
+ if (error) {
+ gfs2_withdraw_delayed(sdp);
return error;
+ }
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
if (gfs2_withdrawn(sdp)) {
@@ -750,11 +752,13 @@ void gfs2_freeze_func(struct work_struct *work)
static int gfs2_freeze(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- int error = 0;
+ int error;
mutex_lock(&sdp->sd_freeze_mutex);
- if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN)
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) {
+ error = -EBUSY;
goto out;
+ }
for (;;) {
if (gfs2_withdrawn(sdp)) {
@@ -795,10 +799,10 @@ static int gfs2_unfreeze(struct super_block *sb)
struct gfs2_sbd *sdp = sb->s_fs_info;
mutex_lock(&sdp->sd_freeze_mutex);
- if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
+ if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN ||
!gfs2_holder_initialized(&sdp->sd_freeze_gh)) {
mutex_unlock(&sdp->sd_freeze_mutex);
- return 0;
+ return -EINVAL;
}
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 03e6c046faf4..84714bbccc12 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -569,6 +569,8 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.rename = hfsplus_rename,
.getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
+ .fileattr_get = hfsplus_fileattr_get,
+ .fileattr_set = hfsplus_fileattr_set,
};
const struct file_operations hfsplus_dir_operations = {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 12b20479ed2b..1798949f269b 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -345,17 +345,6 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
#define hfs_part_find hfsplus_part_find
/*
- * definitions for ext2 flag ioctls (linux really needs a generic
- * interface for this).
- */
-
-/* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support
- * chattr/lsattr */
-#define HFSPLUS_IOC_EXT2_GETFLAGS FS_IOC_GETFLAGS
-#define HFSPLUS_IOC_EXT2_SETFLAGS FS_IOC_SETFLAGS
-
-
-/*
* hfs+-specific ioctl for making the filesystem bootable
*/
#define HFSPLUS_IOC_BLESS _IO('h', 0x80)
@@ -493,6 +482,9 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
unsigned int query_flags);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
+int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int hfsplus_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
/* ioctl.c */
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 078c5c8a5156..8ea447e5c470 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -17,6 +17,7 @@
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/uio.h>
+#include <linux/fileattr.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
@@ -353,6 +354,8 @@ static const struct inode_operations hfsplus_file_inode_operations = {
.setattr = hfsplus_setattr,
.getattr = hfsplus_getattr,
.listxattr = hfsplus_listxattr,
+ .fileattr_get = hfsplus_fileattr_get,
+ .fileattr_set = hfsplus_fileattr_set,
};
static const struct file_operations hfsplus_file_operations = {
@@ -628,3 +631,54 @@ out:
hfs_find_exit(&fd);
return 0;
}
+
+int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ unsigned int flags = 0;
+
+ if (inode->i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+ if (inode->i_flags & S_APPEND)
+ flags |= FS_APPEND_FL;
+ if (hip->userflags & HFSPLUS_FLG_NODUMP)
+ flags |= FS_NODUMP_FL;
+
+ fileattr_fill_flags(fa, flags);
+
+ return 0;
+}
+
+int hfsplus_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ unsigned int new_fl = 0;
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+
+ /* don't silently ignore unsupported ext2 flags */
+ if (fa->flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
+ return -EOPNOTSUPP;
+
+ if (fa->flags & FS_IMMUTABLE_FL)
+ new_fl |= S_IMMUTABLE;
+
+ if (fa->flags & FS_APPEND_FL)
+ new_fl |= S_APPEND;
+
+ inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
+
+ if (fa->flags & FS_NODUMP_FL)
+ hip->userflags |= HFSPLUS_FLG_NODUMP;
+ else
+ hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 3edb1926d127..5661a2e24d03 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -57,95 +57,11 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
return 0;
}
-static inline unsigned int hfsplus_getflags(struct inode *inode)
-{
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- unsigned int flags = 0;
-
- if (inode->i_flags & S_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- if (inode->i_flags & S_APPEND)
- flags |= FS_APPEND_FL;
- if (hip->userflags & HFSPLUS_FLG_NODUMP)
- flags |= FS_NODUMP_FL;
- return flags;
-}
-
-static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
-{
- struct inode *inode = file_inode(file);
- unsigned int flags = hfsplus_getflags(inode);
-
- return put_user(flags, user_flags);
-}
-
-static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
-{
- struct inode *inode = file_inode(file);
- struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- unsigned int flags, new_fl = 0;
- unsigned int oldflags = hfsplus_getflags(inode);
- int err = 0;
-
- err = mnt_want_write_file(file);
- if (err)
- goto out;
-
- if (!inode_owner_or_capable(&init_user_ns, inode)) {
- err = -EACCES;
- goto out_drop_write;
- }
-
- if (get_user(flags, user_flags)) {
- err = -EFAULT;
- goto out_drop_write;
- }
-
- inode_lock(inode);
-
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err)
- goto out_unlock_inode;
-
- /* don't silently ignore unsupported ext2 flags */
- if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
- err = -EOPNOTSUPP;
- goto out_unlock_inode;
- }
-
- if (flags & FS_IMMUTABLE_FL)
- new_fl |= S_IMMUTABLE;
-
- if (flags & FS_APPEND_FL)
- new_fl |= S_APPEND;
-
- inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
-
- if (flags & FS_NODUMP_FL)
- hip->userflags |= HFSPLUS_FLG_NODUMP;
- else
- hip->userflags &= ~HFSPLUS_FLG_NODUMP;
-
- inode->i_ctime = current_time(inode);
- mark_inode_dirty(inode);
-
-out_unlock_inode:
- inode_unlock(inode);
-out_drop_write:
- mnt_drop_write_file(file);
-out:
- return err;
-}
-
long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
void __user *argp = (void __user *)arg;
switch (cmd) {
- case HFSPLUS_IOC_EXT2_GETFLAGS:
- return hfsplus_ioctl_getflags(file, argp);
- case HFSPLUS_IOC_EXT2_SETFLAGS:
- return hfsplus_ioctl_setflags(file, argp);
case HFSPLUS_IOC_BLESS:
return hfsplus_ioctl_bless(file, argp);
default:
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 29e407762626..7b5e984ff02a 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -144,7 +144,7 @@ static char *follow_link(char *link)
char *name, *resolved, *end;
int n;
- name = __getname();
+ name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!name) {
n = -ENOMEM;
goto out_free;
@@ -173,12 +173,11 @@ static char *follow_link(char *link)
goto out_free;
}
- __putname(name);
- kfree(link);
+ kfree(name);
return resolved;
out_free:
- __putname(name);
+ kfree(name);
return ERR_PTR(n);
}
@@ -712,7 +711,6 @@ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (name == NULL)
goto out_put;
- init_special_inode(inode, mode, dev);
err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
if (err)
goto out_free;
diff --git a/fs/inode.c b/fs/inode.c
index a047ab306f9a..9e192bea0630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -12,7 +12,6 @@
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
-#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
@@ -2148,7 +2147,7 @@ EXPORT_SYMBOL(init_special_inode);
void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
const struct inode *dir, umode_t mode)
{
- inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode_fsuid_set(inode, mnt_userns);
if (dir && dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
@@ -2160,7 +2159,7 @@ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
!capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
mode &= ~S_ISGID;
} else
- inode->i_gid = fsgid_into_mnt(mnt_userns);
+ inode_fsgid_set(inode, mnt_userns);
inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);
@@ -2314,89 +2313,3 @@ struct timespec64 current_time(struct inode *inode)
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
-
-/*
- * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
- * configurations.
- *
- * Note: the caller should be holding i_mutex, or else be sure that they have
- * exclusive access to the inode structure.
- */
-int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
- unsigned int flags)
-{
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- *
- * This test looks nicer. Thanks to Pauline Middelink
- */
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- return fscrypt_prepare_setflags(inode, oldflags, flags);
-}
-EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
-
-/*
- * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
- * configurations.
- *
- * Note: the caller should be holding i_mutex, or else be sure that they have
- * exclusive access to the inode structure.
- */
-int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
- struct fsxattr *fa)
-{
- /*
- * Can't modify an immutable/append-only file unless we have
- * appropriate permission.
- */
- if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
- (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() != &init_user_ns) {
- if (old_fa->fsx_projid != fa->fsx_projid)
- return -EINVAL;
- if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
- FS_XFLAG_PROJINHERIT)
- return -EINVAL;
- }
-
- /* Check extent size hints. */
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
- !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems.
- */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
- !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
- return -EINVAL;
-
- /* Extent size hints of zero turn off the flags. */
- if (fa->fsx_extsize == 0)
- fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
- if (fa->fsx_cowextsize == 0)
- fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
-
- return 0;
-}
-EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 3dc10bfd8c3b..4eba531bea5a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -16,7 +16,6 @@
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/tracehook.h>
-#include <linux/freezer.h>
#include "../kernel/sched/sched.h"
#include "io-wq.h"
@@ -388,11 +387,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
static bool io_flush_signals(void)
{
- if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) {
+ if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
- if (current->task_works)
- task_work_run();
- clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL);
+ tracehook_notify_signal();
return true;
}
return false;
@@ -418,6 +415,7 @@ static void io_worker_handle_work(struct io_worker *worker)
{
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
+ bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
do {
struct io_wq_work *work;
@@ -447,6 +445,9 @@ get_next:
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
+
+ if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
+ work->flags |= IO_WQ_WORK_CANCEL;
wq->do_work(work);
io_assign_current_work(worker, NULL);
@@ -487,7 +488,7 @@ static int io_wqe_worker(void *data)
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
io_wqe_inc_running(worker);
- sprintf(buf, "iou-wrk-%d", wq->task_pid);
+ snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task_pid);
set_task_comm(current, buf);
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
@@ -505,10 +506,15 @@ loop:
if (io_flush_signals())
continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
- if (try_to_freeze() || ret)
- continue;
- if (fatal_signal_pending(current))
+ if (signal_pending(current)) {
+ struct ksignal ksig;
+
+ if (!get_signal(&ksig))
+ continue;
break;
+ }
+ if (ret)
+ continue;
/* timed out, exit unless we're the fixed worker */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
!(worker->flags & IO_WORKER_F_FIXED))
@@ -709,16 +715,20 @@ static int io_wq_manager(void *data)
char buf[TASK_COMM_LEN];
int node;
- sprintf(buf, "iou-mgr-%d", wq->task_pid);
+ snprintf(buf, sizeof(buf), "iou-mgr-%d", wq->task_pid);
set_task_comm(current, buf);
do {
set_current_state(TASK_INTERRUPTIBLE);
io_wq_check_workers(wq);
schedule_timeout(HZ);
- try_to_freeze();
- if (fatal_signal_pending(current))
+ if (signal_pending(current)) {
+ struct ksignal ksig;
+
+ if (!get_signal(&ksig))
+ continue;
set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ }
} while (!test_bit(IO_WQ_BIT_EXIT, &wq->state));
io_wq_check_workers(wq);
@@ -1065,7 +1075,11 @@ static void io_wq_destroy(struct io_wq *wq)
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- WARN_ON_ONCE(!wq_list_empty(&wqe->work_list));
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_all,
+ .cancel_all = true,
+ };
+ io_wqe_cancel_pending_work(wqe, &match);
kfree(wqe);
}
io_wq_put_hash(wq->hash);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 543551d70327..dff34975d86b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,7 +78,6 @@
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/freezer.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -698,6 +697,7 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
REQ_F_COMPLETE_INLINE_BIT,
+ REQ_F_REISSUE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -741,6 +741,8 @@ enum {
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
+ /* caller should reissue async */
+ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
};
struct async_poll {
@@ -1095,8 +1097,6 @@ static bool io_match_task(struct io_kiocb *head,
io_for_each_link(req, head) {
if (req->flags & REQ_F_INFLIGHT)
return true;
- if (req->task->files == files)
- return true;
}
return false;
}
@@ -1216,7 +1216,7 @@ static void io_prep_async_work(struct io_kiocb *req)
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
- } else {
+ } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
@@ -1239,16 +1239,16 @@ static void io_queue_async_work(struct io_kiocb *req)
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
/* init ->work of the whole link before punting */
io_prep_async_link(req);
+ trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+ &req->work, req->flags);
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
}
-static void io_kill_timeout(struct io_kiocb *req)
+static void io_kill_timeout(struct io_kiocb *req, int status)
{
struct io_timeout_data *io = req->async_data;
int ret;
@@ -1258,31 +1258,11 @@ static void io_kill_timeout(struct io_kiocb *req)
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_cqring_fill_event(req, 0);
+ io_cqring_fill_event(req, status);
io_put_req_deferred(req, 1);
}
}
-/*
- * Returns true if we found and killed one or more timeouts
- */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- struct files_struct *files)
-{
- struct io_kiocb *req, *tmp;
- int canceled = 0;
-
- spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_match_task(req, tsk, files)) {
- io_kill_timeout(req);
- canceled++;
- }
- }
- spin_unlock_irq(&ctx->completion_lock);
- return canceled != 0;
-}
-
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
@@ -1327,7 +1307,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
break;
list_del_init(&req->timeout.list);
- io_kill_timeout(req);
+ io_kill_timeout(req, 0);
} while (!list_empty(&ctx->timeout_list));
ctx->cq_last_tm_flush = seq;
@@ -2499,6 +2479,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
return false;
return true;
}
+#else
+static bool io_rw_should_reissue(struct io_kiocb *req)
+{
+ return false;
+}
#endif
static bool io_rw_reissue(struct io_kiocb *req)
@@ -2524,13 +2509,14 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
{
int cflags = 0;
- if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
+ if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
+ if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) {
+ req->flags |= REQ_F_REISSUE;
return;
+ }
if (res != req->result)
req_set_fail_links(req);
-
- if (req->rw.kiocb.ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
__io_req_complete(req, issue_flags, res, cflags);
@@ -2776,6 +2762,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
+ bool check_reissue = kiocb->ki_complete == io_complete_rw;
/* add previously done IO, if any */
if (io && io->bytes_done > 0) {
@@ -2791,6 +2778,18 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
__io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
+
+ if (check_reissue && req->flags & REQ_F_REISSUE) {
+ req->flags &= ~REQ_F_REISSUE;
+ if (!io_rw_reissue(req)) {
+ int cflags = 0;
+
+ req_set_fail_links(req);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_rw_kbuf(req);
+ __io_req_complete(req, issue_flags, ret, cflags);
+ }
+ }
}
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
@@ -3307,11 +3306,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(req, iter);
- if (ret == -EIOCBQUEUED) {
- if (req->async_data)
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
- goto out_free;
- } else if (ret == -EAGAIN) {
+ if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
+ req->flags &= ~REQ_F_REISSUE;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3321,6 +3317,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
/* some cases will consume bytes even on error returns */
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
+ } else if (ret == -EIOCBQUEUED) {
+ goto out_free;
} else if (ret <= 0 || ret == io_size || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
/* read all, failed, already did sync or don't want to retry */
@@ -3433,6 +3431,11 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
else
ret2 = -EINVAL;
+ if (req->flags & REQ_F_REISSUE) {
+ req->flags &= ~REQ_F_REISSUE;
+ ret2 = -EAGAIN;
+ }
+
/*
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
* retry them without IOCB_NOWAIT.
@@ -3442,8 +3445,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
- if (ret2 == -EIOCBQUEUED && req->async_data)
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
@@ -3978,6 +3979,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
static int io_provide_buffers_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ unsigned long size;
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
@@ -3991,7 +3993,8 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
- if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
+ size = (unsigned long)p->len * p->nbufs;
+ if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
@@ -4820,7 +4823,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = -ENOMEM;
goto out;
}
- io = req->async_data;
memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
@@ -5583,7 +5585,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
- io_req_track_inflight(req);
+ if (is_timeout_link)
+ io_req_track_inflight(req);
return 0;
}
@@ -6479,8 +6482,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
fail_req:
- io_put_req(req);
- io_req_complete(req, ret);
if (link->head) {
/* fail even hard links since we don't submit */
link->head->flags |= REQ_F_FAIL_LINK;
@@ -6488,6 +6489,8 @@ fail_req:
io_req_complete(link->head, -ECANCELED);
link->head = NULL;
}
+ io_put_req(req);
+ io_req_complete(req, ret);
return ret;
}
ret = io_req_prep(req, sqe);
@@ -6740,7 +6743,7 @@ static int io_sq_thread(void *data)
char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
- sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+ snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
set_task_comm(current, buf);
current->pf_io_worker = NULL;
@@ -6751,21 +6754,32 @@ static int io_sq_thread(void *data)
current->flags |= PF_NO_SETAFFINITY;
mutex_lock(&sqd->lock);
+ /* a user may had exited before the thread started */
+ io_run_task_work_head(&sqd->park_task_work);
+
while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
int ret;
bool cap_entries, sqt_spin, needs_sched;
- if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+ if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
+ signal_pending(current)) {
+ bool did_sig = false;
+
mutex_unlock(&sqd->lock);
+ if (signal_pending(current)) {
+ struct ksignal ksig;
+
+ did_sig = get_signal(&ksig);
+ }
cond_resched();
mutex_lock(&sqd->lock);
io_run_task_work();
io_run_task_work_head(&sqd->park_task_work);
+ if (did_sig)
+ break;
timeout = jiffies + sqd->sq_thread_idle;
continue;
}
- if (fatal_signal_pending(current))
- break;
sqt_spin = false;
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -6808,7 +6822,6 @@ static int io_sq_thread(void *data)
mutex_unlock(&sqd->lock);
schedule();
- try_to_freeze();
mutex_lock(&sqd->lock);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
@@ -6873,7 +6886,7 @@ static int io_run_task_work_sig(void)
return 1;
if (!signal_pending(current))
return 0;
- if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
return -EINTR;
}
@@ -8563,6 +8576,14 @@ static void io_ring_exit_work(struct work_struct *work)
struct io_tctx_node *node;
int ret;
+ /* prevent SQPOLL from submitting new requests */
+ if (ctx->sq_data) {
+ io_sq_thread_park(ctx->sq_data);
+ list_del_init(&ctx->sqd_list);
+ io_sqd_update_thread_idle(ctx->sq_data);
+ io_sq_thread_unpark(ctx->sq_data);
+ }
+
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
@@ -8599,6 +8620,28 @@ static void io_ring_exit_work(struct work_struct *work)
io_ring_ctx_free(ctx);
}
+/* Returns true if we found and killed one or more timeouts */
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+ struct files_struct *files)
+{
+ struct io_kiocb *req, *tmp;
+ int canceled = 0;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
+ if (io_match_task(req, tsk, files)) {
+ io_kill_timeout(req, -ECANCELED);
+ canceled++;
+ }
+ }
+ if (canceled != 0)
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+ if (canceled != 0)
+ io_cqring_ev_posted(ctx);
+ return canceled != 0;
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
@@ -8614,14 +8657,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
- /* prevent SQPOLL from submitting new requests */
- if (ctx->sq_data) {
- io_sq_thread_park(ctx->sq_data);
- list_del_init(&ctx->sqd_list);
- io_sqd_update_thread_idle(ctx->sq_data);
- io_sq_thread_unpark(ctx->sq_data);
- }
-
io_kill_timeouts(ctx, NULL, NULL);
io_poll_remove_all(ctx, NULL, NULL);
@@ -8998,6 +9033,8 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
+ __io_uring_files_cancel(NULL);
+
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4e6cc0a7d69c..1e2204fa9963 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -19,6 +19,9 @@
#include <linux/falloc.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>
+#include <linux/mount.h>
+#include <linux/fscrypt.h>
+#include <linux/fileattr.h>
#include "internal.h"
@@ -657,6 +660,307 @@ out:
return ret;
}
+/**
+ * fileattr_fill_xflags - initialize fileattr with xflags
+ * @fa: fileattr pointer
+ * @xflags: FS_XFLAG_* flags
+ *
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
+ * other fields are zeroed.
+ */
+void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->fsx_valid = true;
+ fa->fsx_xflags = xflags;
+ if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
+ fa->flags |= FS_IMMUTABLE_FL;
+ if (fa->fsx_xflags & FS_XFLAG_APPEND)
+ fa->flags |= FS_APPEND_FL;
+ if (fa->fsx_xflags & FS_XFLAG_SYNC)
+ fa->flags |= FS_SYNC_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NOATIME)
+ fa->flags |= FS_NOATIME_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NODUMP)
+ fa->flags |= FS_NODUMP_FL;
+ if (fa->fsx_xflags & FS_XFLAG_DAX)
+ fa->flags |= FS_DAX_FL;
+ if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+ fa->flags |= FS_PROJINHERIT_FL;
+}
+EXPORT_SYMBOL(fileattr_fill_xflags);
+
+/**
+ * fileattr_fill_flags - initialize fileattr with flags
+ * @fa: fileattr pointer
+ * @flags: FS_*_FL flags
+ *
+ * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
+ * All other fields are zeroed.
+ */
+void fileattr_fill_flags(struct fileattr *fa, u32 flags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->flags_valid = true;
+ fa->flags = flags;
+ if (fa->flags & FS_SYNC_FL)
+ fa->fsx_xflags |= FS_XFLAG_SYNC;
+ if (fa->flags & FS_IMMUTABLE_FL)
+ fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+ if (fa->flags & FS_APPEND_FL)
+ fa->fsx_xflags |= FS_XFLAG_APPEND;
+ if (fa->flags & FS_NODUMP_FL)
+ fa->fsx_xflags |= FS_XFLAG_NODUMP;
+ if (fa->flags & FS_NOATIME_FL)
+ fa->fsx_xflags |= FS_XFLAG_NOATIME;
+ if (fa->flags & FS_DAX_FL)
+ fa->fsx_xflags |= FS_XFLAG_DAX;
+ if (fa->flags & FS_PROJINHERIT_FL)
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+}
+EXPORT_SYMBOL(fileattr_fill_flags);
+
+/**
+ * vfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry: the object to retrieve from
+ * @fa: fileattr pointer
+ *
+ * Call i_op->fileattr_get() callback, if exists.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (!inode->i_op->fileattr_get)
+ return -ENOIOCTLCMD;
+
+ return inode->i_op->fileattr_get(dentry, fa);
+}
+EXPORT_SYMBOL(vfs_fileattr_get);
+
+/**
+ * copy_fsxattr_to_user - copy fsxattr to userspace.
+ * @fa: fileattr pointer
+ * @ufa: fsxattr user pointer
+ *
+ * Return: 0 on success, or -EFAULT on failure.
+ */
+int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ if (copy_to_user(ufa, &xfa, sizeof(xfa)))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(copy_fsxattr_to_user);
+
+static int copy_fsxattr_from_user(struct fileattr *fa,
+ struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+
+ if (copy_from_user(&xfa, ufa, sizeof(xfa)))
+ return -EFAULT;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+
+ return 0;
+}
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
+ * any invalid configurations.
+ *
+ * Note: must be called with inode lock held.
+ */
+static int fileattr_set_prepare(struct inode *inode,
+ const struct fileattr *old_ma,
+ struct fileattr *fa)
+{
+ int err;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ */
+ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
+ if (err)
+ return err;
+
+ /*
+ * Project Quota ID state is only allowed to change from within the init
+ * namespace. Enforce that restriction only if we are trying to change
+ * the quota ID state. Everything else is allowed in user namespaces.
+ */
+ if (current_user_ns() != &init_user_ns) {
+ if (old_ma->fsx_projid != fa->fsx_projid)
+ return -EINVAL;
+ if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
+ FS_XFLAG_PROJINHERIT)
+ return -EINVAL;
+ }
+
+ /* Check extent size hints. */
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+ !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+ !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * It is only valid to set the DAX flag on regular files and
+ * directories on filesystems.
+ */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+
+ /* Extent size hints of zero turn off the flags. */
+ if (fa->fsx_extsize == 0)
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+ if (fa->fsx_cowextsize == 0)
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+ return 0;
+}
+
+/**
+ * vfs_fileattr_set - change miscellaneous file attributes
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the object to change
+ * @fa: fileattr pointer
+ *
+ * After verifying permissions, call i_op->fileattr_set() callback, if
+ * exists.
+ *
+ * Verifying attributes involves retrieving current attributes with
+ * i_op->fileattr_get(), this also allows initializing attributes that have
+ * not been set by the caller to current values. Inode lock is held
+ * thoughout to prevent racing with another instance.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fileattr old_ma = {};
+ int err;
+
+ if (!inode->i_op->fileattr_set)
+ return -ENOIOCTLCMD;
+
+ if (!inode_owner_or_capable(mnt_userns, inode))
+ return -EPERM;
+
+ inode_lock(inode);
+ err = vfs_fileattr_get(dentry, &old_ma);
+ if (!err) {
+ /* initialize missing bits from old_ma */
+ if (fa->flags_valid) {
+ fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
+ fa->fsx_extsize = old_ma.fsx_extsize;
+ fa->fsx_nextents = old_ma.fsx_nextents;
+ fa->fsx_projid = old_ma.fsx_projid;
+ fa->fsx_cowextsize = old_ma.fsx_cowextsize;
+ } else {
+ fa->flags |= old_ma.flags & ~FS_COMMON_FL;
+ }
+ err = fileattr_set_prepare(inode, &old_ma, fa);
+ if (!err)
+ err = inode->i_op->fileattr_set(mnt_userns, dentry, fa);
+ }
+ inode_unlock(inode);
+
+ return err;
+}
+EXPORT_SYMBOL(vfs_fileattr_set);
+
+static int ioctl_getflags(struct file *file, unsigned int __user *argp)
+{
+ struct fileattr fa = { .flags_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (!err)
+ err = put_user(fa.flags, argp);
+ return err;
+}
+
+static int ioctl_setflags(struct file *file, unsigned int __user *argp)
+{
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct fileattr fa;
+ unsigned int flags;
+ int err;
+
+ err = get_user(flags, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ fileattr_fill_flags(&fa, flags);
+ err = vfs_fileattr_set(mnt_userns, dentry, &fa);
+ mnt_drop_write_file(file);
+ }
+ }
+ return err;
+}
+
+static int ioctl_fsgetxattr(struct file *file, void __user *argp)
+{
+ struct fileattr fa = { .fsx_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (!err)
+ err = copy_fsxattr_to_user(&fa, argp);
+
+ return err;
+}
+
+static int ioctl_fssetxattr(struct file *file, void __user *argp)
+{
+ struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct fileattr fa;
+ int err;
+
+ err = copy_fsxattr_from_user(&fa, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ err = vfs_fileattr_set(mnt_userns, dentry, &fa);
+ mnt_drop_write_file(file);
+ }
+ }
+ return err;
+}
+
/*
* do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
* It's just a simple helper for sys_ioctl and compat_sys_ioctl.
@@ -727,6 +1031,18 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return put_user(i_size_read(inode) - filp->f_pos,
(int __user *)argp);
+ case FS_IOC_GETFLAGS:
+ return ioctl_getflags(filp, argp);
+
+ case FS_IOC_SETFLAGS:
+ return ioctl_setflags(filp, argp);
+
+ case FS_IOC_FSGETXATTR:
+ return ioctl_fsgetxattr(filp, argp);
+
+ case FS_IOC_FSSETXATTR:
+ return ioctl_fssetxattr(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
return file_ioctl(filp, cmd, argp);
@@ -828,6 +1144,15 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
#endif
/*
+ * These access 32-bit values anyway so no further handling is
+ * necessary.
+ */
+ case FS_IOC32_GETFLAGS:
+ case FS_IOC32_SETFLAGS:
+ cmd = (cmd == FS_IOC32_GETFLAGS) ?
+ FS_IOC_GETFLAGS : FS_IOC_SETFLAGS;
+ fallthrough;
+ /*
* everything else in do_vfs_ioctl() takes either a compatible
* pointer argument or no argument -- call it with a modified
* argument.
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 414769a6ad11..0129e6bab985 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1155,7 +1155,8 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int
-iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
+iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index a5e478de1417..6250ca6a1f85 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -18,6 +18,7 @@ struct iomap_swapfile_info {
uint64_t highest_ppage; /* highest physical addr seen (pages) */
unsigned long nr_pages; /* number of pages collected */
int nr_extents; /* extent count */
+ struct file *file;
};
/*
@@ -70,6 +71,18 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
return 0;
}
+static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
+{
+ char *buf, *p = ERR_PTR(-ENOMEM);
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (buf)
+ p = file_path(isi->file, buf, PATH_MAX);
+ pr_err("swapon: file %s %s\n", IS_ERR(p) ? "<unknown>" : p, str);
+ kfree(buf);
+ return -EINVAL;
+}
+
/*
* Accumulate iomaps for this swap file. We have to accumulate iomaps because
* swap only cares about contiguous page-aligned physical extents and makes no
@@ -89,28 +102,20 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
break;
case IOMAP_INLINE:
/* No inline data. */
- pr_err("swapon: file is inline\n");
- return -EINVAL;
+ return iomap_swapfile_fail(isi, "is inline");
default:
- pr_err("swapon: file has unallocated extents\n");
- return -EINVAL;
+ return iomap_swapfile_fail(isi, "has unallocated extents");
}
/* No uncommitted metadata or shared blocks. */
- if (iomap->flags & IOMAP_F_DIRTY) {
- pr_err("swapon: file is not committed\n");
- return -EINVAL;
- }
- if (iomap->flags & IOMAP_F_SHARED) {
- pr_err("swapon: file has shared extents\n");
- return -EINVAL;
- }
+ if (iomap->flags & IOMAP_F_DIRTY)
+ return iomap_swapfile_fail(isi, "is not committed");
+ if (iomap->flags & IOMAP_F_SHARED)
+ return iomap_swapfile_fail(isi, "has shared extents");
/* Only one bdev per swap file. */
- if (iomap->bdev != isi->sis->bdev) {
- pr_err("swapon: file is on multiple devices\n");
- return -EINVAL;
- }
+ if (iomap->bdev != isi->sis->bdev)
+ return iomap_swapfile_fail(isi, "outside the main device");
if (isi->iomap.length == 0) {
/* No accumulated extent, so just store it. */
@@ -139,6 +144,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
struct iomap_swapfile_info isi = {
.sis = sis,
.lowest_ppage = (sector_t)-1ULL,
+ .file = swap_file,
};
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 28b70e7c7dd4..1d732fd223d4 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -130,6 +130,8 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const struct inode_operations jfs_file_inode_operations = {
.listxattr = jfs_listxattr,
.setattr = jfs_setattr,
+ .fileattr_get = jfs_fileattr_get,
+ .fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
@@ -147,7 +149,5 @@ const struct file_operations jfs_file_operations = {
.fsync = jfs_fsync,
.release = jfs_release,
.unlocked_ioctl = jfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = jfs_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
};
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 2581d4db58ff..03a845ab4f00 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/blkdev.h>
#include <asm/current.h>
#include <linux/uaccess.h>
+#include <linux/fileattr.h>
#include "jfs_filsys.h"
#include "jfs_debug.h"
@@ -56,69 +57,56 @@ static long jfs_map_ext2(unsigned long flags, int from)
return mapped;
}
+int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry));
+ unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
-long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ fileattr_fill_flags(fa, jfs_map_ext2(flags, 0));
+
+ return 0;
+}
+
+int jfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = d_inode(dentry);
struct jfs_inode_info *jfs_inode = JFS_IP(inode);
unsigned int flags;
- switch (cmd) {
- case JFS_IOC_GETFLAGS:
- flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
- flags = jfs_map_ext2(flags, 0);
- return put_user(flags, (int __user *) arg);
- case JFS_IOC_SETFLAGS: {
- unsigned int oldflags;
- int err;
-
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (!inode_owner_or_capable(&init_user_ns, inode)) {
- err = -EACCES;
- goto setflags_out;
- }
- if (get_user(flags, (int __user *) arg)) {
- err = -EFAULT;
- goto setflags_out;
- }
+ if (d_is_special(dentry))
+ return -ENOTTY;
- flags = jfs_map_ext2(flags, 1);
- if (!S_ISDIR(inode->i_mode))
- flags &= ~JFS_DIRSYNC_FL;
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- /* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- err = -EPERM;
- goto setflags_out;
- }
+ flags = jfs_map_ext2(fa->flags, 1);
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~JFS_DIRSYNC_FL;
- /* Lock against other parallel changes of flags */
- inode_lock(inode);
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ return -EPERM;
- oldflags = jfs_map_ext2(jfs_inode->mode2 & JFS_FL_USER_VISIBLE,
- 0);
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err) {
- inode_unlock(inode);
- goto setflags_out;
- }
+ flags = flags & JFS_FL_USER_MODIFIABLE;
+ flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE;
+ jfs_inode->mode2 = flags;
- flags = flags & JFS_FL_USER_MODIFIABLE;
- flags |= jfs_inode->mode2 & ~JFS_FL_USER_MODIFIABLE;
- jfs_inode->mode2 = flags;
-
- jfs_set_inode_flags(inode);
- inode_unlock(inode);
- inode->i_ctime = current_time(inode);
- mark_inode_dirty(inode);
-setflags_out:
- mnt_drop_write_file(filp);
- return err;
- }
+ jfs_set_inode_flags(inode);
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ switch (cmd) {
case FITRIM:
{
struct super_block *sb = inode->i_sb;
@@ -156,22 +144,3 @@ setflags_out:
return -ENOTTY;
}
}
-
-#ifdef CONFIG_COMPAT
-long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- /* While these ioctl numbers defined with 'long' and have different
- * numbers than the 64bit ABI,
- * the actual implementation only deals with ints and is compatible.
- */
- switch (cmd) {
- case JFS_IOC_GETFLAGS32:
- cmd = JFS_IOC_GETFLAGS;
- break;
- case JFS_IOC_SETFLAGS32:
- cmd = JFS_IOC_SETFLAGS;
- break;
- }
- return jfs_ioctl(filp, cmd, arg);
-}
-#endif
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 5fa9fd594115..d6af79e94263 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -160,11 +160,4 @@ struct dinode {
#define JFS_FL_USER_MODIFIABLE 0x03F80000
#define JFS_FL_INHERIT 0x03C80000
-/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */
-#define JFS_IOC_GETFLAGS _IOR('f', 1, long)
-#define JFS_IOC_SETFLAGS _IOW('f', 2, long)
-
-#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int)
-#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int)
-
#endif /*_H_JFS_DINODE */
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 01daa0cb0ae5..7de961a81862 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -9,8 +9,10 @@ struct fid;
extern struct inode *ialloc(struct inode *, umode_t);
extern int jfs_fsync(struct file *, loff_t, loff_t, int);
+extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int jfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
-extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
extern int jfs_write_inode(struct inode *, struct writeback_control *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 9abed0d750e5..9db4f5789c0e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1522,6 +1522,8 @@ const struct inode_operations jfs_dir_inode_operations = {
.rename = jfs_rename,
.listxattr = jfs_listxattr,
.setattr = jfs_setattr,
+ .fileattr_get = jfs_fileattr_get,
+ .fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
@@ -1533,9 +1535,7 @@ const struct file_operations jfs_dir_operations = {
.iterate = jfs_readdir,
.fsync = jfs_fsync,
.unlocked_ioctl = jfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = jfs_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = generic_file_llseek,
};
diff --git a/fs/libfs.c b/fs/libfs.c
index e2de5401abca..e9b29c6ffccb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -481,6 +481,7 @@ EXPORT_SYMBOL(simple_rename);
/**
* simple_setattr - setattr for simple filesystem
+ * @mnt_userns: user namespace of the target mount
* @dentry: dentry
* @iattr: iattr structure
*
diff --git a/fs/locks.c b/fs/locks.c
index 6125d2de39b8..5c42363aa811 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2369,7 +2369,6 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
if (flock->l_pid != 0)
goto out;
- cmd = F_GETLK;
fl->fl_flags |= FL_OFDLCK;
fl->fl_owner = filp;
}
@@ -2825,7 +2824,7 @@ struct locks_iterator {
};
static void lock_get_status(struct seq_file *f, struct file_lock *fl,
- loff_t id, char *pfx)
+ loff_t id, char *pfx, int repeat)
{
struct inode *inode = NULL;
unsigned int fl_pid;
@@ -2841,7 +2840,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
if (fl->fl_file != NULL)
inode = locks_inode(fl->fl_file);
- seq_printf(f, "%lld:%s ", id, pfx);
+ seq_printf(f, "%lld: ", id);
+
+ if (repeat)
+ seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
+
if (IS_POSIX(fl)) {
if (fl->fl_flags & FL_ACCESS)
seq_puts(f, "ACCESS");
@@ -2903,21 +2906,64 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
}
+static struct file_lock *get_next_blocked_member(struct file_lock *node)
+{
+ struct file_lock *tmp;
+
+ /* NULL node or root node */
+ if (node == NULL || node->fl_blocker == NULL)
+ return NULL;
+
+ /* Next member in the linked list could be itself */
+ tmp = list_next_entry(node, fl_blocked_member);
+ if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
+ || tmp == node) {
+ return NULL;
+ }
+
+ return tmp;
+}
+
static int locks_show(struct seq_file *f, void *v)
{
struct locks_iterator *iter = f->private;
- struct file_lock *fl, *bfl;
+ struct file_lock *cur, *tmp;
struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
+ int level = 0;
- fl = hlist_entry(v, struct file_lock, fl_link);
+ cur = hlist_entry(v, struct file_lock, fl_link);
- if (locks_translate_pid(fl, proc_pidns) == 0)
+ if (locks_translate_pid(cur, proc_pidns) == 0)
return 0;
- lock_get_status(f, fl, iter->li_pos, "");
+ /* View this crossed linked list as a binary tree, the first member of fl_blocked_requests
+ * is the left child of current node, the next silibing in fl_blocked_member is the
+ * right child, we can alse get the parent of current node from fl_blocker, so this
+ * question becomes traversal of a binary tree
+ */
+ while (cur != NULL) {
+ if (level)
+ lock_get_status(f, cur, iter->li_pos, "-> ", level);
+ else
+ lock_get_status(f, cur, iter->li_pos, "", level);
- list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member)
- lock_get_status(f, bfl, iter->li_pos, " ->");
+ if (!list_empty(&cur->fl_blocked_requests)) {
+ /* Turn left */
+ cur = list_first_entry_or_null(&cur->fl_blocked_requests,
+ struct file_lock, fl_blocked_member);
+ level++;
+ } else {
+ /* Turn right */
+ tmp = get_next_blocked_member(cur);
+ /* Fall back to parent node */
+ while (tmp == NULL && cur->fl_blocker != NULL) {
+ cur = cur->fl_blocker;
+ level--;
+ tmp = get_next_blocked_member(cur);
+ }
+ cur = tmp;
+ }
+ }
return 0;
}
@@ -2938,7 +2984,7 @@ static void __show_fd_locks(struct seq_file *f,
(*id)++;
seq_puts(f, "lock:\t");
- lock_get_status(f, fl, *id, "");
+ lock_get_status(f, fl, *id, "", 0);
}
}
diff --git a/fs/namei.c b/fs/namei.c
index 216f16e74351..79b0ff9b151e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -579,6 +579,8 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->stack = p->internal;
p->dfd = dfd;
p->name = name;
+ p->path.mnt = NULL;
+ p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
current->nameidata = p;
@@ -652,6 +654,8 @@ static void terminate_walk(struct nameidata *nd)
rcu_read_unlock();
}
nd->depth = 0;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
}
/* path_put is needed afterwards regardless of success or failure */
@@ -1122,8 +1126,7 @@ int may_linkat(struct user_namespace *mnt_userns, struct path *link)
* should be allowed, or not, on files that already
* exist.
* @mnt_userns: user namespace of the mount the inode was found from
- * @dir_mode: mode bits of directory
- * @dir_uid: owner of directory
+ * @nd: nameidata pathwalk data
* @inode: the inode of the file to open
*
* Block an O_CREAT open of a FIFO (or a regular file) when:
@@ -2322,8 +2325,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
- nd->path.mnt = NULL;
- nd->path.dentry = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
@@ -2419,16 +2420,16 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
while (!(err = link_path_walk(s, nd)) &&
(s = lookup_last(nd)) != NULL)
;
+ if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
+ err = handle_lookup_down(nd);
+ nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+ }
if (!err)
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY)
if (!d_can_lookup(nd->path.dentry))
err = -ENOTDIR;
- if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
- err = handle_lookup_down(nd);
- nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
- }
if (!err) {
*path = nd->path;
nd->path.mnt = NULL;
@@ -2823,16 +2824,14 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
static inline int may_create(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *child)
{
- struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- s_user_ns = dir->i_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
- !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
+ if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
return -EOVERFLOW;
+
return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
}
@@ -3034,14 +3033,11 @@ static int may_o_create(struct user_namespace *mnt_userns,
const struct path *dir, struct dentry *dentry,
umode_t mode)
{
- struct user_namespace *s_user_ns;
int error = security_path_mknod(dir, dentry, mode, 0);
if (error)
return error;
- s_user_ns = dir->dentry->d_sb->s_user_ns;
- if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
- !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
+ if (!fsuidgid_has_mapping(dir->dentry->d_sb, mnt_userns))
return -EOVERFLOW;
error = inode_permission(mnt_userns, dir->dentry->d_inode,
@@ -3381,7 +3377,7 @@ static int do_open(struct nameidata *nd,
* @mnt_userns: user namespace of the mount the inode was found from
* @dentry: pointer to dentry of the base directory
* @mode: mode of the new tmpfile
- * @open_flags: flags
+ * @open_flag: flags
*
* Create a temporary file.
*
@@ -4406,14 +4402,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
/**
* vfs_rename - rename a filesystem object
- * @old_mnt_userns: old user namespace of the mount the inode was found from
- * @old_dir: parent of source
- * @old_dentry: source
- * @new_mnt_userns: new user namespace of the mount the inode was found from
- * @new_dir: parent of destination
- * @new_dentry: destination
- * @delegated_inode: returns an inode needing a delegation break
- * @flags: rename flags
+ * @rd: pointer to &struct renamedata info
*
* The caller must hold multiple mutexes--see lock_rename()).
*
diff --git a/fs/namespace.c b/fs/namespace.c
index 56bb5a5fdc0d..f63337828e1c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1242,8 +1242,9 @@ struct vfsmount *mntget(struct vfsmount *mnt)
}
EXPORT_SYMBOL(mntget);
-/* path_is_mountpoint() - Check if path is a mount in the current
- * namespace.
+/**
+ * path_is_mountpoint() - Check if path is a mount in the current namespace.
+ * @path: path to check
*
* d_mountpoint() can only be used reliably to establish if a dentry is
* not mounted in any namespace and that common case is handled inline.
@@ -1369,7 +1370,7 @@ void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
/**
* may_umount_tree - check if a mount tree is busy
- * @mnt: root of mount tree
+ * @m: root of mount tree
*
* This is called to check if a tree of mounts has any
* open files, pwds, chroots or sub mounts that are
@@ -1939,10 +1940,11 @@ void drop_collected_mounts(struct vfsmount *mnt)
/**
* clone_private_mount - create a private clone of a path
+ * @path: path to clone
*
- * This creates a new vfsmount, which will be the clone of @path. The new will
- * not be attached anywhere in the namespace and will be private (i.e. changes
- * to the originating mount won't be propagated into this).
+ * This creates a new vfsmount, which will be the clone of @path. The new mount
+ * will not be attached anywhere in the namespace and will be private (i.e.
+ * changes to the originating mount won't be propagated into this).
*
* Release with mntput().
*/
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
new file mode 100644
index 000000000000..578112713703
--- /dev/null
+++ b/fs/netfs/Kconfig
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NETFS_SUPPORT
+ tristate "Support for network filesystem high-level I/O"
+ help
+ This option enables support for network filesystems, including
+ helpers for high-level buffered I/O, abstracting out read
+ segmentation, local caching and transparent huge page support.
+
+config NETFS_STATS
+ bool "Gather statistical information on local caching"
+ depends on NETFS_SUPPORT && PROC_FS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
new file mode 100644
index 000000000000..c15bfc966d96
--- /dev/null
+++ b/fs/netfs/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+netfs-y := read_helper.o stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
new file mode 100644
index 000000000000..b7f2c4459f33
--- /dev/null
+++ b/fs/netfs/internal.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for network filesystem support
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "netfs: " fmt
+
+/*
+ * read_helper.c
+ */
+extern unsigned int netfs_debug;
+
+/*
+ * stats.c
+ */
+#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_readahead;
+extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_rreq;
+extern atomic_t netfs_n_rh_sreq;
+extern atomic_t netfs_n_rh_download;
+extern atomic_t netfs_n_rh_download_done;
+extern atomic_t netfs_n_rh_download_failed;
+extern atomic_t netfs_n_rh_download_instead;
+extern atomic_t netfs_n_rh_read;
+extern atomic_t netfs_n_rh_read_done;
+extern atomic_t netfs_n_rh_read_failed;
+extern atomic_t netfs_n_rh_zero;
+extern atomic_t netfs_n_rh_short_read;
+extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_begin;
+extern atomic_t netfs_n_rh_write_done;
+extern atomic_t netfs_n_rh_write_failed;
+extern atomic_t netfs_n_rh_write_zskip;
+
+
+static inline void netfs_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void netfs_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#else
+#define netfs_stat(x) do {} while(0)
+#define netfs_stat_d(x) do {} while(0)
+#endif
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+ printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_NETFS_DEBUG)
+#define _enter(FMT, ...) \
+do { \
+ if (netfs_debug) \
+ kenter(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _leave(FMT, ...) \
+do { \
+ if (netfs_debug) \
+ kleave(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#define _debug(FMT, ...) \
+do { \
+ if (netfs_debug) \
+ kdebug(FMT, ##__VA_ARGS__); \
+} while (0)
+
+#else
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
+#endif
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
new file mode 100644
index 000000000000..193841d03de0
--- /dev/null
+++ b/fs/netfs/read_helper.c
@@ -0,0 +1,1185 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Network filesystem high-level read support.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
+
+MODULE_DESCRIPTION("Network fs support");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned netfs_debug;
+module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+static void netfs_rreq_work(struct work_struct *);
+static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool);
+
+static void netfs_put_subrequest(struct netfs_read_subrequest *subreq,
+ bool was_async)
+{
+ if (refcount_dec_and_test(&subreq->usage))
+ __netfs_put_subrequest(subreq, was_async);
+}
+
+static struct netfs_read_request *netfs_alloc_read_request(
+ const struct netfs_read_request_ops *ops, void *netfs_priv,
+ struct file *file)
+{
+ static atomic_t debug_ids;
+ struct netfs_read_request *rreq;
+
+ rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
+ if (rreq) {
+ rreq->netfs_ops = ops;
+ rreq->netfs_priv = netfs_priv;
+ rreq->inode = file_inode(file);
+ rreq->i_size = i_size_read(rreq->inode);
+ rreq->debug_id = atomic_inc_return(&debug_ids);
+ INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, netfs_rreq_work);
+ refcount_set(&rreq->usage, 1);
+ __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ ops->init_rreq(rreq, file);
+ netfs_stat(&netfs_n_rh_rreq);
+ }
+
+ return rreq;
+}
+
+static void netfs_get_read_request(struct netfs_read_request *rreq)
+{
+ refcount_inc(&rreq->usage);
+}
+
+static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq,
+ bool was_async)
+{
+ struct netfs_read_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_read_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, was_async);
+ }
+}
+
+static void netfs_free_read_request(struct work_struct *work)
+{
+ struct netfs_read_request *rreq =
+ container_of(work, struct netfs_read_request, work);
+ netfs_rreq_clear_subreqs(rreq, false);
+ if (rreq->netfs_priv)
+ rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+ netfs_stat_d(&netfs_n_rh_rreq);
+}
+
+static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
+{
+ if (refcount_dec_and_test(&rreq->usage)) {
+ if (was_async) {
+ rreq->work.func = netfs_free_read_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_read_request(&rreq->work);
+ }
+ }
+}
+
+/*
+ * Allocate and partially initialise an I/O request structure.
+ */
+static struct netfs_read_subrequest *netfs_alloc_subrequest(
+ struct netfs_read_request *rreq)
+{
+ struct netfs_read_subrequest *subreq;
+
+ subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->usage, 2);
+ subreq->rreq = rreq;
+ netfs_get_read_request(rreq);
+ netfs_stat(&netfs_n_rh_sreq);
+ }
+
+ return subreq;
+}
+
+static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq)
+{
+ refcount_inc(&subreq->usage);
+}
+
+static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
+ bool was_async)
+{
+ struct netfs_read_request *rreq = subreq->rreq;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ kfree(subreq);
+ netfs_stat_d(&netfs_n_rh_sreq);
+ netfs_put_read_request(rreq, was_async);
+}
+
+/*
+ * Clear the unread part of an I/O request.
+ */
+static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
+{
+ struct iov_iter iter;
+
+ iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+ iov_iter_zero(iov_iter_count(&iter), &iter);
+}
+
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_read_subrequest *subreq = priv;
+
+ netfs_subreq_terminated(subreq, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_from_cache(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq,
+ bool seek_data)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+
+ netfs_stat(&netfs_n_rh_read);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ subreq->start + subreq->transferred,
+ subreq->len - subreq->transferred);
+
+ cres->ops->read(cres, subreq->start, &iter, seek_data,
+ netfs_cache_read_terminated, subreq);
+}
+
+/*
+ * Fill a subrequest region with zeroes.
+ */
+static void netfs_fill_with_zeroes(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_zero);
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, 0, false);
+}
+
+/*
+ * Ask the netfs to issue a read request to the server for us.
+ *
+ * The netfs is expected to read from subreq->pos + subreq->transferred to
+ * subreq->pos + subreq->len - 1. It may not backtrack and write data into the
+ * buffer prior to the transferred point as it might clobber dirty data
+ * obtained from the cache.
+ *
+ * Alternatively, the netfs is allowed to indicate one of two things:
+ *
+ * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and
+ * make progress.
+ *
+ * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be
+ * cleared.
+ */
+static void netfs_read_from_server(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq)
+{
+ netfs_stat(&netfs_n_rh_download);
+ rreq->netfs_ops->issue_op(subreq);
+}
+
+/*
+ * Release those waiting.
+ */
+static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_done);
+ netfs_rreq_clear_subreqs(rreq, was_async);
+ netfs_put_read_request(rreq, was_async);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache. We have to clear
+ * the PG_fscache bits on the pages involved and release the caller's ref.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
+ bool was_async)
+{
+ struct netfs_read_subrequest *subreq;
+ struct page *page;
+ pgoff_t unlocked = 0;
+ bool have_unlocked = false;
+
+ rcu_read_lock();
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+ xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+ /* We might have multiple writes from the same huge
+ * page, but we mustn't unlock a page more than once.
+ */
+ if (have_unlocked && page->index <= unlocked)
+ continue;
+ unlocked = page->index;
+ end_page_fscache(page);
+ have_unlocked = true;
+ }
+ }
+
+ rcu_read_unlock();
+ netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_read_subrequest *subreq = priv;
+ struct netfs_read_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ netfs_stat(&netfs_n_rh_write_failed);
+ trace_netfs_failure(rreq, subreq, transferred_or_error,
+ netfs_fail_copy_to_cache);
+ } else {
+ netfs_stat(&netfs_n_rh_write_done);
+ }
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write_term);
+
+ /* If we decrement nr_wr_ops to 0, the ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_wr_ops))
+ netfs_rreq_unmark_after_write(rreq, was_async);
+
+ netfs_put_subrequest(subreq, was_async);
+}
+
+/*
+ * Perform any outstanding writes to the cache. We inherit a ref from the
+ * caller.
+ */
+static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct netfs_read_subrequest *subreq, *next, *p;
+ struct iov_iter iter;
+ int ret;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_write);
+
+ /* We don't want terminating writes trying to wake us up whilst we're
+ * still going through the list.
+ */
+ atomic_inc(&rreq->nr_wr_ops);
+
+ list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) {
+ if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
+ list_del_init(&subreq->rreq_link);
+ netfs_put_subrequest(subreq, false);
+ }
+ }
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ /* Amalgamate adjacent writes */
+ while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ next = list_next_entry(subreq, rreq_link);
+ if (next->start != subreq->start + subreq->len)
+ break;
+ subreq->len += next->len;
+ list_del_init(&next->rreq_link);
+ netfs_put_subrequest(next, false);
+ }
+
+ ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
+ rreq->i_size);
+ if (ret < 0) {
+ trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
+ continue;
+ }
+
+ iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ subreq->start, subreq->len);
+
+ atomic_inc(&rreq->nr_wr_ops);
+ netfs_stat(&netfs_n_rh_write);
+ netfs_get_read_subrequest(subreq);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_write);
+ cres->ops->write(cres, subreq->start, &iter,
+ netfs_rreq_copy_terminated, subreq);
+ }
+
+ /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_wr_ops))
+ netfs_rreq_unmark_after_write(rreq, false);
+}
+
+static void netfs_rreq_write_to_cache_work(struct work_struct *work)
+{
+ struct netfs_read_request *rreq =
+ container_of(work, struct netfs_read_request, work);
+
+ netfs_rreq_do_write_to_cache(rreq);
+}
+
+static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq,
+ bool was_async)
+{
+ if (was_async) {
+ rreq->work.func = netfs_rreq_write_to_cache_work;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_rreq_do_write_to_cache(rreq);
+ }
+}
+
+/*
+ * Unlock the pages in a read operation. We need to set PG_fscache on any
+ * pages we're going to write back before we unlock them.
+ */
+static void netfs_rreq_unlock(struct netfs_read_request *rreq)
+{
+ struct netfs_read_subrequest *subreq;
+ struct page *page;
+ unsigned int iopos, account = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+ int i;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
+ __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+ }
+ }
+
+ /* Walk through the pagecache and the I/O request lists simultaneously.
+ * We may have a mixture of cached and uncached sections and we only
+ * really want to write out the uncached sections. This is slightly
+ * complicated by the possibility that we might have huge pages with a
+ * mixture inside.
+ */
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_read_subrequest, rreq_link);
+ iopos = 0;
+ subreq_failed = (subreq->error < 0);
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last_page) {
+ unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + thp_size(page);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+ if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+ set_page_fscache(page);
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ account += subreq->transferred;
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed) {
+ for (i = 0; i < thp_nr_pages(page); i++)
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) {
+ if (page->index == rreq->no_unlock_page &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags))
+ _debug("no unlock");
+ else
+ unlock_page(page);
+ }
+ }
+ rcu_read_unlock();
+
+ task_io_account_read(account);
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+}
+
+/*
+ * Handle a short read.
+ */
+static void netfs_rreq_short_read(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq)
+{
+ __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
+ __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
+
+ netfs_stat(&netfs_n_rh_short_read);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
+ netfs_get_read_subrequest(subreq);
+ atomic_inc(&rreq->nr_rd_ops);
+ if (subreq->source == NETFS_READ_FROM_CACHE)
+ netfs_read_from_cache(rreq, subreq, true);
+ else
+ netfs_read_from_server(rreq, subreq);
+}
+
+/*
+ * Resubmit any short or failed operations. Returns true if we got the rreq
+ * ref back.
+ */
+static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
+{
+ struct netfs_read_subrequest *subreq;
+
+ WARN_ON(in_interrupt());
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
+ /* We don't want terminating submissions trying to wake us up whilst
+ * we're still going through the list.
+ */
+ atomic_inc(&rreq->nr_rd_ops);
+
+ __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error) {
+ if (subreq->source != NETFS_READ_FROM_CACHE)
+ break;
+ subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
+ subreq->error = 0;
+ netfs_stat(&netfs_n_rh_download_instead);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
+ netfs_get_read_subrequest(subreq);
+ atomic_inc(&rreq->nr_rd_ops);
+ netfs_read_from_server(rreq, subreq);
+ } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) {
+ netfs_rreq_short_read(rreq, subreq);
+ }
+ }
+
+ /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_rd_ops))
+ return true;
+
+ wake_up_var(&rreq->nr_rd_ops);
+ return false;
+}
+
+/*
+ * Check to see if the data read is still valid.
+ */
+static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq)
+{
+ struct netfs_read_subrequest *subreq;
+
+ if (!rreq->netfs_ops->is_still_valid ||
+ rreq->netfs_ops->is_still_valid(rreq))
+ return;
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ subreq->error = -ESTALE;
+ __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ }
+ }
+}
+
+/*
+ * Assess the state of a read request and decide what to do next.
+ *
+ * Note that we could be in an ordinary kernel thread, on a workqueue or in
+ * softirq context at this point. We inherit a ref from the caller.
+ */
+static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async)
+{
+ trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
+
+again:
+ netfs_rreq_is_still_valid(rreq);
+
+ if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) {
+ if (netfs_rreq_perform_resubmissions(rreq))
+ goto again;
+ return;
+ }
+
+ netfs_rreq_unlock(rreq);
+
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags))
+ return netfs_rreq_write_to_cache(rreq, was_async);
+
+ netfs_rreq_completed(rreq, was_async);
+}
+
+static void netfs_rreq_work(struct work_struct *work)
+{
+ struct netfs_read_request *rreq =
+ container_of(work, struct netfs_read_request, work);
+ netfs_rreq_assess(rreq, false);
+}
+
+/*
+ * Handle the completion of all outstanding I/O operations on a read request.
+ * We inherit a ref from the caller.
+ */
+static void netfs_rreq_terminated(struct netfs_read_request *rreq,
+ bool was_async)
+{
+ if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) &&
+ was_async) {
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_rreq_assess(rreq, was_async);
+ }
+}
+
+/**
+ * netfs_subreq_terminated - Note the termination of an I/O operation.
+ * @subreq: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the read helper that a contributory I/O operation has terminated,
+ * one way or another, and that it should integrate the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred, 0 to
+ * indicate a failure to transfer anything that should be retried or a negative
+ * error code. The helper will look after reissuing I/O operations as
+ * appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ */
+void netfs_subreq_terminated(struct netfs_read_subrequest *subreq,
+ ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_read_request *rreq = subreq->rreq;
+ int u;
+
+ _enter("[%u]{%llx,%lx},%zd",
+ subreq->debug_index, subreq->start, subreq->flags,
+ transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_READ_FROM_CACHE:
+ netfs_stat(&netfs_n_rh_read_done);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_stat(&netfs_n_rh_download_done);
+ break;
+ default:
+ break;
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(rreq, subreq, transferred_or_error,
+ netfs_fail_read);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq overread: R%x[%x] %zd > %zu - %zu",
+ rreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+complete:
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+ set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
+ u = atomic_dec_return(&rreq->nr_rd_ops);
+ if (u == 0)
+ netfs_rreq_terminated(rreq, was_async);
+ else if (u == 1)
+ wake_up_var(&rreq->nr_rd_ops);
+
+ netfs_put_subrequest(subreq, was_async);
+ return;
+
+incomplete:
+ if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
+ netfs_clear_unread(subreq);
+ subreq->transferred = subreq->len;
+ goto complete;
+ }
+
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ goto out;
+
+failed:
+ if (subreq->source == NETFS_READ_FROM_CACHE) {
+ netfs_stat(&netfs_n_rh_read_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags);
+ } else {
+ netfs_stat(&netfs_n_rh_download_failed);
+ set_bit(NETFS_RREQ_FAILED, &rreq->flags);
+ rreq->error = subreq->error;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_subreq_terminated);
+
+static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq,
+ loff_t i_size)
+{
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (cres->ops)
+ return cres->ops->prepare_read(subreq, i_size);
+ if (subreq->start >= rreq->i_size)
+ return NETFS_FILL_WITH_ZEROES;
+ return NETFS_DOWNLOAD_FROM_SERVER;
+}
+
+/*
+ * Work out what sort of subrequest the next one will be.
+ */
+static enum netfs_read_source
+netfs_rreq_prepare_read(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq)
+{
+ enum netfs_read_source source;
+
+ _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
+
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+
+ if (source == NETFS_DOWNLOAD_FROM_SERVER) {
+ /* Call out to the netfs to let it shrink the request to fit
+ * its own I/O sizes and boundaries. If it shinks it here, it
+ * will be called again to make simultaneous calls; if it wants
+ * to make serial calls, it can indicate a short read and then
+ * we will call it again.
+ */
+ if (subreq->len > rreq->i_size - subreq->start)
+ subreq->len = rreq->i_size - subreq->start;
+
+ if (rreq->netfs_ops->clamp_length &&
+ !rreq->netfs_ops->clamp_length(subreq)) {
+ source = NETFS_INVALID_READ;
+ goto out;
+ }
+ }
+
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+
+out:
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ return source;
+}
+
+/*
+ * Slice off a piece of a read request and submit an I/O request for it.
+ */
+static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq,
+ unsigned int *_debug_index)
+{
+ struct netfs_read_subrequest *subreq;
+ enum netfs_read_source source;
+
+ subreq = netfs_alloc_subrequest(rreq);
+ if (!subreq)
+ return false;
+
+ subreq->debug_index = (*_debug_index)++;
+ subreq->start = rreq->start + rreq->submitted;
+ subreq->len = rreq->len - rreq->submitted;
+
+ _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ /* Call out to the cache to find out what it can do with the remaining
+ * subset. It tells us in subreq->flags what it decided should be done
+ * and adjusts subreq->len down if the subset crosses a cache boundary.
+ *
+ * Then when we hand the subset, it can choose to take a subset of that
+ * (the starts must coincide), in which case, we go around the loop
+ * again and ask it to download the next piece.
+ */
+ source = netfs_rreq_prepare_read(rreq, subreq);
+ if (source == NETFS_INVALID_READ)
+ goto subreq_failed;
+
+ atomic_inc(&rreq->nr_rd_ops);
+
+ rreq->submitted += subreq->len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ switch (source) {
+ case NETFS_FILL_WITH_ZEROES:
+ netfs_fill_with_zeroes(rreq, subreq);
+ break;
+ case NETFS_DOWNLOAD_FROM_SERVER:
+ netfs_read_from_server(rreq, subreq);
+ break;
+ case NETFS_READ_FROM_CACHE:
+ netfs_read_from_cache(rreq, subreq, false);
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+subreq_failed:
+ rreq->error = subreq->error;
+ netfs_put_subrequest(subreq, false);
+ return false;
+}
+
+static void netfs_cache_expand_readahead(struct netfs_read_request *rreq,
+ loff_t *_start, size_t *_len, loff_t i_size)
+{
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+
+ if (cres->ops && cres->ops->expand_readahead)
+ cres->ops->expand_readahead(cres, _start, _len, i_size);
+}
+
+static void netfs_rreq_expand(struct netfs_read_request *rreq,
+ struct readahead_control *ractl)
+{
+ /* Give the cache a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);
+
+ /* Give the netfs a chance to change the request parameters. The
+ * resultant request must contain the original region.
+ */
+ if (rreq->netfs_ops->expand_readahead)
+ rreq->netfs_ops->expand_readahead(rreq);
+
+ /* Expand the request if the cache wants it to start earlier. Note
+ * that the expansion may get further extended if the VM wishes to
+ * insert THPs and the preferred start and/or end wind up in the middle
+ * of THPs.
+ *
+ * If this is the case, however, the THP size should be an integer
+ * multiple of the cache granule size, so we get a whole number of
+ * granules to deal with.
+ */
+ if (rreq->start != readahead_pos(ractl) ||
+ rreq->len != readahead_length(ractl)) {
+ readahead_expand(ractl, rreq->start, rreq->len);
+ rreq->start = readahead_pos(ractl);
+ rreq->len = readahead_length(ractl);
+
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_expanded);
+ }
+}
+
+/**
+ * netfs_readahead - Helper to manage a read request
+ * @ractl: The description of the readahead request
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Fulfil a readahead request by drawing data from the cache if possible, or
+ * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
+ * requests from different sources will get munged together. If necessary, the
+ * readahead window can be expanded in either direction to a more convenient
+ * alighment for RPC efficiency or to make storage in the cache feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory. It may also be passed a private token, which will
+ * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
+ *
+ * This is usable whether or not caching is enabled.
+ */
+void netfs_readahead(struct readahead_control *ractl,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv)
+{
+ struct netfs_read_request *rreq;
+ struct page *page;
+ unsigned int debug_index = 0;
+ int ret;
+
+ _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
+
+ if (readahead_count(ractl) == 0)
+ goto cleanup;
+
+ rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file);
+ if (!rreq)
+ goto cleanup;
+ rreq->mapping = ractl->mapping;
+ rreq->start = readahead_pos(ractl);
+ rreq->len = readahead_length(ractl);
+
+ if (ops->begin_cache_operation) {
+ ret = ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
+ }
+
+ netfs_stat(&netfs_n_rh_readahead);
+ trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
+ netfs_read_trace_readahead);
+
+ netfs_rreq_expand(rreq, ractl);
+
+ atomic_set(&rreq->nr_rd_ops, 1);
+ do {
+ if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ break;
+
+ } while (rreq->submitted < rreq->len);
+
+ /* Drop the refs on the pages here rather than in the cache or
+ * filesystem. The locks will be dropped in netfs_rreq_unlock().
+ */
+ while ((page = readahead_page(ractl)))
+ put_page(page);
+
+ /* If we decrement nr_rd_ops to 0, the ref belongs to us. */
+ if (atomic_dec_and_test(&rreq->nr_rd_ops))
+ netfs_rreq_assess(rreq, false);
+ return;
+
+cleanup_free:
+ netfs_put_read_request(rreq, false);
+ return;
+cleanup:
+ if (netfs_priv)
+ ops->cleanup(ractl->mapping, netfs_priv);
+ return;
+}
+EXPORT_SYMBOL(netfs_readahead);
+
+/**
+ * netfs_readpage - Helper to manage a readpage request
+ * @file: The file to read from
+ * @page: The page to read
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Fulfil a readpage request by drawing data from the cache if possible, or the
+ * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests
+ * from different sources will get munged together.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory. It may also be passed a private token, which will
+ * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup().
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_readpage(struct file *file,
+ struct page *page,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv)
+{
+ struct netfs_read_request *rreq;
+ unsigned int debug_index = 0;
+ int ret;
+
+ _enter("%lx", page_index(page));
+
+ rreq = netfs_alloc_read_request(ops, netfs_priv, file);
+ if (!rreq) {
+ if (netfs_priv)
+ ops->cleanup(netfs_priv, page_file_mapping(page));
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ rreq->mapping = page_file_mapping(page);
+ rreq->start = page_file_offset(page);
+ rreq->len = thp_size(page);
+
+ if (ops->begin_cache_operation) {
+ ret = ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) {
+ unlock_page(page);
+ goto out;
+ }
+ }
+
+ netfs_stat(&netfs_n_rh_readpage);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
+
+ netfs_get_read_request(rreq);
+
+ atomic_set(&rreq->nr_rd_ops, 1);
+ do {
+ if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ break;
+
+ } while (rreq->submitted < rreq->len);
+
+ /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
+ * the service code isn't punted off to a random thread pool to
+ * process.
+ */
+ do {
+ wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
+ netfs_rreq_assess(rreq, false);
+ } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags));
+
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage);
+ ret = -EIO;
+ }
+out:
+ netfs_put_read_request(rreq, false);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_readpage);
+
+static void netfs_clear_thp(struct page *page)
+{
+ unsigned int i;
+
+ for (i = 0; i < thp_nr_pages(page); i++)
+ clear_highpage(page + i);
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write in this page
+ * @flags: AOP_* flags
+ * @_page: Where to put the resultant page
+ * @_fsdata: Place for the netfs to store a cookie
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not. Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together. If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alighment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the page is grabbed and locked. It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end. It is permitted to sleep. It should return 0 if the request
+ * should go ahead; unlock the page and return -EAGAIN to cause the page to be
+ * regot; or return an error.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **_page, void **_fsdata,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv)
+{
+ struct netfs_read_request *rreq;
+ struct page *page, *xpage;
+ struct inode *inode = file_inode(file);
+ unsigned int debug_index = 0;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ int pos_in_page = pos & ~PAGE_MASK;
+ loff_t size;
+ int ret;
+
+ DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
+
+retry:
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ return -ENOMEM;
+
+ if (ops->check_write_begin) {
+ /* Allow the netfs (eg. ceph) to flush conflicts. */
+ ret = ops->check_write_begin(file, pos, len, page, _fsdata);
+ if (ret < 0) {
+ trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
+ if (ret == -EAGAIN)
+ goto retry;
+ goto error;
+ }
+ }
+
+ if (PageUptodate(page))
+ goto have_page;
+
+ /* If the page is beyond the EOF, we want to clear it - unless it's
+ * within the cache granule containing the EOF, in which case we need
+ * to preload the granule.
+ */
+ size = i_size_read(inode);
+ if (!ops->is_cache_enabled(inode) &&
+ ((pos_in_page == 0 && len == thp_size(page)) ||
+ (pos >= size) ||
+ (pos_in_page == 0 && (pos + len) >= size))) {
+ netfs_clear_thp(page);
+ SetPageUptodate(page);
+ netfs_stat(&netfs_n_rh_write_zskip);
+ goto have_page_no_wait;
+ }
+
+ ret = -ENOMEM;
+ rreq = netfs_alloc_read_request(ops, netfs_priv, file);
+ if (!rreq)
+ goto error;
+ rreq->mapping = page->mapping;
+ rreq->start = page_offset(page);
+ rreq->len = thp_size(page);
+ rreq->no_unlock_page = page->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags);
+ netfs_priv = NULL;
+
+ if (ops->begin_cache_operation) {
+ ret = ops->begin_cache_operation(rreq);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+ }
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
+
+ /* Expand the request to meet caching requirements and download
+ * preferences.
+ */
+ ractl._nr_pages = thp_nr_pages(page);
+ netfs_rreq_expand(rreq, &ractl);
+ netfs_get_read_request(rreq);
+
+ /* We hold the page locks, so we can drop the references */
+ while ((xpage = readahead_page(&ractl)))
+ if (xpage != page)
+ put_page(xpage);
+
+ atomic_set(&rreq->nr_rd_ops, 1);
+ do {
+ if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ break;
+
+ } while (rreq->submitted < rreq->len);
+
+ /* Keep nr_rd_ops incremented so that the ref always belongs to us, and
+ * the service code isn't punted off to a random thread pool to
+ * process.
+ */
+ for (;;) {
+ wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1);
+ netfs_rreq_assess(rreq, false);
+ if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
+ break;
+ cond_resched();
+ }
+
+ ret = rreq->error;
+ if (ret == 0 && rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin);
+ ret = -EIO;
+ }
+ netfs_put_read_request(rreq, false);
+ if (ret < 0)
+ goto error;
+
+have_page:
+ ret = wait_on_page_fscache_killable(page);
+ if (ret < 0)
+ goto error;
+have_page_no_wait:
+ if (netfs_priv)
+ ops->cleanup(netfs_priv, mapping);
+ *_page = page;
+ _leave(" = 0");
+ return 0;
+
+error_put:
+ netfs_put_read_request(rreq, false);
+error:
+ unlock_page(page);
+ put_page(page);
+ if (netfs_priv)
+ ops->cleanup(netfs_priv, mapping);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_write_begin);
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
new file mode 100644
index 000000000000..9ae538c85378
--- /dev/null
+++ b/fs/netfs/stats.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Netfs support statistics
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/seq_file.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+atomic_t netfs_n_rh_readahead;
+atomic_t netfs_n_rh_readpage;
+atomic_t netfs_n_rh_rreq;
+atomic_t netfs_n_rh_sreq;
+atomic_t netfs_n_rh_download;
+atomic_t netfs_n_rh_download_done;
+atomic_t netfs_n_rh_download_failed;
+atomic_t netfs_n_rh_download_instead;
+atomic_t netfs_n_rh_read;
+atomic_t netfs_n_rh_read_done;
+atomic_t netfs_n_rh_read_failed;
+atomic_t netfs_n_rh_zero;
+atomic_t netfs_n_rh_short_read;
+atomic_t netfs_n_rh_write;
+atomic_t netfs_n_rh_write_begin;
+atomic_t netfs_n_rh_write_done;
+atomic_t netfs_n_rh_write_failed;
+atomic_t netfs_n_rh_write_zskip;
+
+void netfs_stats_show(struct seq_file *m)
+{
+ seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ atomic_read(&netfs_n_rh_readahead),
+ atomic_read(&netfs_n_rh_readpage),
+ atomic_read(&netfs_n_rh_write_begin),
+ atomic_read(&netfs_n_rh_write_zskip),
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq));
+ seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_zero),
+ atomic_read(&netfs_n_rh_short_read),
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ atomic_read(&netfs_n_rh_download),
+ atomic_read(&netfs_n_rh_download_done),
+ atomic_read(&netfs_n_rh_download_failed),
+ atomic_read(&netfs_n_rh_download_instead));
+ seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ atomic_read(&netfs_n_rh_read),
+ atomic_read(&netfs_n_rh_read_done),
+ atomic_read(&netfs_n_rh_read_failed));
+ seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_rh_write),
+ atomic_read(&netfs_n_rh_write_done),
+ atomic_read(&netfs_n_rh_write_failed));
+}
+EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 971a9251c1d9..a06d213d7689 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -463,6 +463,9 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (opt < 0)
return ctx->sloppy ? 1 : opt;
+ if (fc->security)
+ ctx->has_sec_mnt_opts = 1;
+
switch (opt) {
case Opt_source:
if (fc->source)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a7fb076a5f44..5a8854de0c19 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -335,7 +335,7 @@ nfs_find_actor(struct inode *inode, void *opaque)
if (NFS_FILEID(inode) != fattr->fileid)
return 0;
- if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+ if (inode_wrong_type(inode, fattr->mode))
return 0;
if (nfs_compare_fh(NFS_FH(inode), fh))
return 0;
@@ -1461,7 +1461,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return 0;
return -ESTALE;
}
- if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode))
return -ESTALE;
@@ -1876,7 +1876,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/*
* Make sure the inode's type hasn't changed.
*/
- if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) {
/*
* Big trouble! The inode has become a different object.
*/
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b644d6c09e4..7395d0977b7d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -96,6 +96,7 @@ struct nfs_fs_context {
char *fscache_uniq;
unsigned short protofamily;
unsigned short mountfamily;
+ bool has_sec_mnt_opts;
struct {
union {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 94885c6f8f54..4aaa1f5dd381 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1045,7 +1045,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
sb->s_blocksize = 0;
sb->s_xattr = server->nfs_client->cl_nfs_mod->xattr;
sb->s_op = server->nfs_client->cl_nfs_mod->sops;
- if (ctx && ctx->bsize)
+ if (ctx->bsize)
sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
if (server->nfs_client->rpc_ops->version != 2) {
@@ -1077,6 +1077,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
&sb->s_blocksize_bits);
nfs_super_set_maxbytes(sb, server->maxfilesize);
+ server->has_sec_mnt_opts = ctx->has_sec_mnt_opts;
}
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b,
@@ -1193,6 +1194,9 @@ static int nfs_compare_super(struct super_block *sb, struct fs_context *fc)
return 0;
if (!nfs_compare_userns(old, server))
return 0;
+ if ((old->has_sec_mnt_opts || fc->security) &&
+ security_sb_mnt_opts_compat(sb, fc->security))
+ return 0;
return nfs_compare_mount_options(sb, server, fc);
}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 79c563c1a5e8..5a5bd85d08f8 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -136,6 +136,77 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
}
EXPORT_SYMBOL_GPL(nfsacl_encode);
+/**
+ * nfs_stream_encode_acl - Encode an NFSv3 ACL
+ *
+ * @xdr: an xdr_stream positioned to receive an encoded ACL
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Return values:
+ * %false: The ACL could not be encoded
+ * %true: @xdr is advanced to the next available position
+ */
+bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
+ struct posix_acl *acl, int encode_entries,
+ int typeflag)
+{
+ const size_t elem_size = XDR_UNIT * 3;
+ u32 entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
+ struct nfsacl_encode_desc nfsacl_desc = {
+ .desc = {
+ .elem_size = elem_size,
+ .array_len = encode_entries ? entries : 0,
+ .xcode = xdr_nfsace_encode,
+ },
+ .acl = acl,
+ .typeflag = typeflag,
+ .uid = inode->i_uid,
+ .gid = inode->i_gid,
+ };
+ struct nfsacl_simple_acl aclbuf;
+ unsigned int base;
+ int err;
+
+ if (entries > NFS_ACL_MAX_ENTRIES)
+ return false;
+ if (xdr_stream_encode_u32(xdr, entries) < 0)
+ return false;
+
+ if (encode_entries && acl && acl->a_count == 3) {
+ struct posix_acl *acl2 = &aclbuf.acl;
+
+ /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
+ * invoked in contexts where a memory allocation failure is
+ * fatal. Fortunately this fake ACL is small enough to
+ * construct on the stack. */
+ posix_acl_init(acl2, 4);
+
+ /* Insert entries in canonical order: other orders seem
+ to confuse Solaris VxFS. */
+ acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
+ acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */
+ acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */
+ acl2->a_entries[2].e_tag = ACL_MASK;
+ acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */
+ nfsacl_desc.acl = acl2;
+ }
+
+ base = xdr_stream_pos(xdr);
+ if (!xdr_reserve_space(xdr, XDR_UNIT +
+ elem_size * nfsacl_desc.desc.array_len))
+ return false;
+ err = xdr_encode_array2(xdr->buf, base, &nfsacl_desc.desc);
+ if (err)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs_stream_encode_acl);
+
+
struct nfsacl_decode_desc {
struct xdr_array2_desc desc;
unsigned int count;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index d6cff5fbe705..5fa38ad9e7e3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -99,7 +99,7 @@ config NFSD_BLOCKLAYOUT
help
This option enables support for the exporting pNFS block layouts
in the kernel's NFS server. The pNFS block layout enables NFS
- clients to directly perform I/O to block devices accesible to both
+ clients to directly perform I/O to block devices accessible to both
the server and the clients. See RFC 5663 for more details.
If unsure, say N.
@@ -113,7 +113,7 @@ config NFSD_SCSILAYOUT
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
- clients to directly perform I/O to SCSI devices accesible to both
+ clients to directly perform I/O to SCSI devices accessible to both
the server and the clients. See draft-ietf-nfsv4-scsi-layout for
more details.
@@ -127,7 +127,7 @@ config NFSD_FLEXFILELAYOUT
This option enables support for the exporting pNFS Flex File
layouts in the kernel's NFS server. The pNFS Flex File layout
enables NFS clients to directly perform I/O to NFSv3 devices
- accesible to both the server and the clients. See
+ accessible to both the server and the clients. See
draft-ietf-nfsv4-flex-files for more details.
Warning, this server implements the bare minimum functionality
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index c330f5bd0cf3..a75abeb1e698 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -51,9 +51,6 @@ struct nfsd_net {
bool grace_ended;
time64_t boot_time;
- /* internal mount of the "nfsd" pseudofilesystem: */
- struct vfsmount *nfsd_mnt;
-
struct dentry *nfsd_client_dir;
/*
@@ -130,6 +127,9 @@ struct nfsd_net {
wait_queue_head_t ntf_wq;
atomic_t ntf_refcnt;
+ /* Allow umount to wait for nfsd state cleanup */
+ struct completion nfsd_shutdown_complete;
+
/*
* clientid and stateid data for construction of net unique COPY
* stateids.
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 855e17772eba..4b43929c1f25 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -242,79 +242,61 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
/* GETACL */
static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct inode *inode;
- struct kvec *head = rqstp->rq_res.head;
- unsigned int base;
- int n;
int w;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
- /*
- * Since this is version 2, the check for nfserr in
- * nfsd_dispatch actually ensures the following cannot happen.
- * However, it seems fragile to depend on that.
- */
if (dentry == NULL || d_really_is_negative(dentry))
- return 0;
+ return 1;
inode = d_inode(dentry);
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
- *p++ = htonl(resp->mask);
- if (!xdr_ressize_check(rqstp, p))
+ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return 0;
- base = (char *)p - (char *)head->iov_base;
rqstp->rq_res.page_len = w = nfsacl_size(
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
if (!*(rqstp->rq_next_page++))
- return 0;
+ return 1;
w -= PAGE_SIZE;
}
- n = nfsacl_encode(&rqstp->rq_res, base, inode,
- resp->acl_access,
- resp->mask & NFS_ACL, 0);
- if (n > 0)
- n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
- resp->acl_default,
- resp->mask & NFS_DFACL,
- NFS_ACL_DEFAULT);
- return (n > 0);
-}
-
-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nfsd_attrstat *resp = rqstp->rq_resp;
-
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- goto out;
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
+ resp->mask & NFS_ACL, 0))
+ return 0;
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
+ resp->mask & NFS_DFACL, NFS_ACL_DEFAULT))
+ return 0;
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
-out:
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
/* ACCESS */
static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- goto out;
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->access) < 0)
+ return 0;
+ break;
+ }
- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
- *p++ = htonl(resp->access);
-out:
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
/*
@@ -329,13 +311,6 @@ static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp)
posix_acl_release(resp->acl_default);
}
-static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp)
-{
- struct nfsd_attrstat *resp = rqstp->rq_resp;
-
- fh_put(&resp->fh);
-}
-
static void nfsaclsvc_release_access(struct svc_rqst *rqstp)
{
struct nfsd3_accessres *resp = rqstp->rq_resp;
@@ -375,8 +350,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
[ACLPROC2_SETACL] = {
.pc_func = nfsacld_proc_setacl,
.pc_decode = nfsaclsvc_decode_setaclargs,
- .pc_encode = nfsaclsvc_encode_attrstatres,
- .pc_release = nfsaclsvc_release_attrstat,
+ .pc_encode = nfssvc_encode_attrstatres,
+ .pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd3_setaclargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
@@ -386,8 +361,8 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
[ACLPROC2_GETATTR] = {
.pc_func = nfsacld_proc_getattr,
.pc_decode = nfssvc_decode_fhandleargs,
- .pc_encode = nfsaclsvc_encode_attrstatres,
- .pc_release = nfsaclsvc_release_attrstat,
+ .pc_encode = nfssvc_encode_attrstatres,
+ .pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9a6f18d74d14..a1591feeea22 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -168,22 +168,25 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
/* GETACL */
static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
+ struct kvec *head = rqstp->rq_res.head;
+ struct inode *inode = d_inode(dentry);
+ unsigned int base;
+ int n;
+ int w;
- *p++ = resp->status;
- p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
- if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
- struct inode *inode = d_inode(dentry);
- struct kvec *head = rqstp->rq_res.head;
- unsigned int base;
- int n;
- int w;
-
- *p++ = htonl(resp->mask);
- if (!xdr_ressize_check(rqstp, p))
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return 0;
- base = (char *)p - (char *)head->iov_base;
+
+ base = (char *)xdr->p - (char *)head->iov_base;
rqstp->rq_res.page_len = w = nfsacl_size(
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
@@ -204,9 +207,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
NFS_ACL_DEFAULT);
if (n <= 0)
return 0;
- } else
- if (!xdr_ressize_check(rqstp, p))
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
return 0;
+ }
return 1;
}
@@ -214,11 +219,11 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
/* SETACL */
static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
- return xdr_ressize_check(rqstp, p);
+ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
+ svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh);
}
/*
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 8675851199f8..17715a6c7a40 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -126,14 +126,15 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
- char *buffer = page_address(*(rqstp->rq_next_page++));
dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
fh_copy(&resp->fh, &argp->fh);
resp->len = NFS3_MAXPATHLEN;
- resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len);
+ resp->pages = rqstp->rq_next_page++;
+ resp->status = nfsd_readlink(rqstp, &resp->fh,
+ page_address(*resp->pages), &resp->len);
return rpc_success;
}
@@ -158,6 +159,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
v = 0;
len = argp->count;
+ resp->pages = rqstp->rq_next_page;
while (len > 0) {
struct page *page = *(rqstp->rq_next_page++);
@@ -439,17 +441,30 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
struct nfsd3_readdirres *resp,
int count)
{
+ struct xdr_buf *buf = &resp->dirlist;
+ struct xdr_stream *xdr = &resp->xdr;
+
count = min_t(u32, count, svc_max_payload(rqstp));
- /* Convert byte count to number of words (i.e. >> 2),
- * and reserve room for the NULL ptr & eof flag (-2 words) */
- resp->buflen = (count >> 2) - 2;
+ memset(buf, 0, sizeof(*buf));
- resp->buffer = page_address(*rqstp->rq_next_page);
+ /* Reserve room for the NULL ptr & eof flag (-2 words) */
+ buf->buflen = count - XDR_UNIT * 2;
+ buf->pages = rqstp->rq_next_page;
while (count > 0) {
rqstp->rq_next_page++;
count -= PAGE_SIZE;
}
+
+ /* This is xdr_init_encode(), but it assumes that
+ * the head kvec has already been consumed. */
+ xdr_set_scratch_buffer(xdr, NULL, 0);
+ xdr->buf = buf;
+ xdr->page_ptr = buf->pages;
+ xdr->iov = NULL;
+ xdr->p = page_address(*buf->pages);
+ xdr->end = xdr->p + (PAGE_SIZE >> 2);
+ xdr->rqst = NULL;
}
/*
@@ -460,10 +475,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
{
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
- int count = 0;
loff_t offset;
- struct page **p;
- caddr_t page_addr = NULL;
dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -471,39 +483,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
- /* Read directory and encode entries on the fly */
fh_copy(&resp->fh, &argp->fh);
-
resp->common.err = nfs_ok;
+ resp->cookie_offset = 0;
resp->rqstp = rqstp;
offset = argp->cookie;
-
resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
- &resp->common, nfs3svc_encode_entry);
+ &resp->common, nfs3svc_encode_entry3);
memcpy(resp->verf, argp->verf, 8);
- count = 0;
- for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
- page_addr = page_address(*p);
+ nfs3svc_encode_cookie3(resp, offset);
- if (((caddr_t)resp->buffer >= page_addr) &&
- ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
- count += (caddr_t)resp->buffer - page_addr;
- break;
- }
- count += PAGE_SIZE;
- }
- resp->count = count >> 2;
- if (resp->offset) {
- if (unlikely(resp->offset1)) {
- /* we ended up with offset on a page boundary */
- *resp->offset = htonl(offset >> 32);
- *resp->offset1 = htonl(offset & 0xffffffff);
- resp->offset1 = NULL;
- } else {
- xdr_encode_hyper(resp->offset, offset);
- }
- resp->offset = NULL;
- }
+ /* Recycle only pages that were part of the reply */
+ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
return rpc_success;
}
@@ -517,10 +508,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
{
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
- int count = 0;
loff_t offset;
- struct page **p;
- caddr_t page_addr = NULL;
dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -528,10 +516,9 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
- /* Read directory and encode entries on the fly */
fh_copy(&resp->fh, &argp->fh);
-
resp->common.err = nfs_ok;
+ resp->cookie_offset = 0;
resp->rqstp = rqstp;
offset = argp->cookie;
@@ -545,30 +532,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
}
resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
- &resp->common, nfs3svc_encode_entry_plus);
+ &resp->common, nfs3svc_encode_entryplus3);
memcpy(resp->verf, argp->verf, 8);
- for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
- page_addr = page_address(*p);
+ nfs3svc_encode_cookie3(resp, offset);
- if (((caddr_t)resp->buffer >= page_addr) &&
- ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
- count += (caddr_t)resp->buffer - page_addr;
- break;
- }
- count += PAGE_SIZE;
- }
- resp->count = count >> 2;
- if (resp->offset) {
- if (unlikely(resp->offset1)) {
- /* we ended up with offset on a page boundary */
- *resp->offset = htonl(offset >> 32);
- *resp->offset1 = htonl(offset & 0xffffffff);
- resp->offset1 = NULL;
- } else {
- xdr_encode_hyper(resp->offset, offset);
- }
- resp->offset = NULL;
- }
+ /* Recycle only pages that were part of the reply */
+ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
out:
return rpc_success;
@@ -736,7 +705,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
[NFS3PROC_GETATTR] = {
.pc_func = nfsd3_proc_getattr,
.pc_decode = nfs3svc_decode_fhandleargs,
- .pc_encode = nfs3svc_encode_attrstatres,
+ .pc_encode = nfs3svc_encode_getattrres,
.pc_release = nfs3svc_release_fhandle,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_attrstatres),
@@ -758,7 +727,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
[NFS3PROC_LOOKUP] = {
.pc_func = nfsd3_proc_lookup,
.pc_decode = nfs3svc_decode_diropargs,
- .pc_encode = nfs3svc_encode_diropres,
+ .pc_encode = nfs3svc_encode_lookupres,
.pc_release = nfs3svc_release_fhandle2,
.pc_argsize = sizeof(struct nfsd3_diropargs),
.pc_ressize = sizeof(struct nfsd3_diropres),
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9d9a01ce0b27..0a5ebc52e6a9 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -14,13 +14,26 @@
#include "netns.h"
#include "vfs.h"
-#define NFSDDBG_FACILITY NFSDDBG_XDR
+/*
+ * Force construction of an empty post-op attr
+ */
+static const struct svc_fh nfs3svc_null_fh = {
+ .fh_no_wcc = true,
+};
+/*
+ * time_delta. {1, 0} means the server is accurate only
+ * to the nearest second.
+ */
+static const struct timespec64 nfs3svc_time_delta = {
+ .tv_sec = 1,
+ .tv_nsec = 0,
+};
/*
* Mapping of S_IF* types to NFS file types
*/
-static u32 nfs3_ftypes[] = {
+static const u32 nfs3_ftypes[] = {
NF3NON, NF3FIFO, NF3CHR, NF3BAD,
NF3DIR, NF3BAD, NF3BLK, NF3BAD,
NF3REG, NF3BAD, NF3LNK, NF3BAD,
@@ -33,9 +46,11 @@ static u32 nfs3_ftypes[] = {
*/
static __be32 *
-encode_time3(__be32 *p, struct timespec64 *time)
+encode_nfstime3(__be32 *p, const struct timespec64 *time)
{
- *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
+ *p++ = cpu_to_be32((u32)time->tv_sec);
+ *p++ = cpu_to_be32(time->tv_nsec);
+
return p;
}
@@ -82,14 +97,80 @@ svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
return true;
}
-static __be32 *
-encode_fh(__be32 *p, struct svc_fh *fhp)
+/**
+ * svcxdr_encode_nfsstat3 - Encode an NFSv3 status code
+ * @xdr: XDR stream
+ * @status: status value to encode
+ *
+ * Return values:
+ * %false: Send buffer space was exhausted
+ * %true: Success
+ */
+bool
+svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(status));
+ if (!p)
+ return false;
+ *p = status;
+
+ return true;
+}
+
+static bool
+svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
{
- unsigned int size = fhp->fh_handle.fh_size;
- *p++ = htonl(size);
- if (size) p[XDR_QUADLEN(size)-1]=0;
+ u32 size = fhp->fh_handle.fh_size;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT + size);
+ if (!p)
+ return false;
+ *p++ = cpu_to_be32(size);
+ if (size)
+ p[XDR_QUADLEN(size) - 1] = 0;
memcpy(p, &fhp->fh_handle.fh_base, size);
- return p + XDR_QUADLEN(size);
+
+ return true;
+}
+
+static bool
+svcxdr_encode_post_op_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
+{
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ if (!svcxdr_encode_nfs_fh3(xdr, fhp))
+ return false;
+
+ return true;
+}
+
+static bool
+svcxdr_encode_cookieverf3(struct xdr_stream *xdr, const __be32 *verf)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS3_COOKIEVERFSIZE);
+ if (!p)
+ return false;
+ memcpy(p, verf, NFS3_COOKIEVERFSIZE);
+
+ return true;
+}
+
+static bool
+svcxdr_encode_writeverf3(struct xdr_stream *xdr, const __be32 *verf)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS3_WRITEVERFSIZE);
+ if (!p)
+ return false;
+ memcpy(p, verf, NFS3_WRITEVERFSIZE);
+
+ return true;
}
static bool
@@ -253,115 +334,157 @@ svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
svcxdr_decode_specdata3(xdr, args);
}
-static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
+static bool
+svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp, const struct kstat *stat)
{
- u64 f;
+ struct user_namespace *userns = nfsd_user_namespace(rqstp);
+ __be32 *p;
+ u64 fsid;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 21);
+ if (!p)
+ return false;
+
+ *p++ = cpu_to_be32(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+ *p++ = cpu_to_be32((u32)(stat->mode & S_IALLUGO));
+ *p++ = cpu_to_be32((u32)stat->nlink);
+ *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid));
+ *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid));
+ if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN)
+ p = xdr_encode_hyper(p, (u64)NFS3_MAXPATHLEN);
+ else
+ p = xdr_encode_hyper(p, (u64)stat->size);
+
+ /* used */
+ p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+
+ /* rdev */
+ *p++ = cpu_to_be32((u32)MAJOR(stat->rdev));
+ *p++ = cpu_to_be32((u32)MINOR(stat->rdev));
+
switch(fsid_source(fhp)) {
- default:
- case FSIDSOURCE_DEV:
- p = xdr_encode_hyper(p, (u64)huge_encode_dev
- (fhp->fh_dentry->d_sb->s_dev));
- break;
case FSIDSOURCE_FSID:
- p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
+ fsid = (u64)fhp->fh_export->ex_fsid;
break;
case FSIDSOURCE_UUID:
- f = ((u64*)fhp->fh_export->ex_uuid)[0];
- f ^= ((u64*)fhp->fh_export->ex_uuid)[1];
- p = xdr_encode_hyper(p, f);
+ fsid = ((u64 *)fhp->fh_export->ex_uuid)[0];
+ fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1];
break;
+ default:
+ fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
}
- return p;
-}
+ p = xdr_encode_hyper(p, fsid);
-static __be32 *
-encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
- struct kstat *stat)
-{
- struct user_namespace *userns = nfsd_user_namespace(rqstp);
- *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
- *p++ = htonl((u32) (stat->mode & S_IALLUGO));
- *p++ = htonl((u32) stat->nlink);
- *p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
- *p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
- if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
- p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
- } else {
- p = xdr_encode_hyper(p, (u64) stat->size);
- }
- p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
- *p++ = htonl((u32) MAJOR(stat->rdev));
- *p++ = htonl((u32) MINOR(stat->rdev));
- p = encode_fsid(p, fhp);
+ /* fileid */
p = xdr_encode_hyper(p, stat->ino);
- p = encode_time3(p, &stat->atime);
- p = encode_time3(p, &stat->mtime);
- p = encode_time3(p, &stat->ctime);
- return p;
+ p = encode_nfstime3(p, &stat->atime);
+ p = encode_nfstime3(p, &stat->mtime);
+ encode_nfstime3(p, &stat->ctime);
+
+ return true;
}
-static __be32 *
-encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+static bool
+svcxdr_encode_wcc_attr(struct xdr_stream *xdr, const struct svc_fh *fhp)
{
- /* Attributes to follow */
- *p++ = xdr_one;
- return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 6);
+ if (!p)
+ return false;
+ p = xdr_encode_hyper(p, (u64)fhp->fh_pre_size);
+ p = encode_nfstime3(p, &fhp->fh_pre_mtime);
+ encode_nfstime3(p, &fhp->fh_pre_ctime);
+
+ return true;
}
-/*
- * Encode post-operation attributes.
- * The inode may be NULL if the call failed because of a stale file
- * handle. In this case, no attributes are returned.
- */
-static __be32 *
-encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+static bool
+svcxdr_encode_pre_op_attr(struct xdr_stream *xdr, const struct svc_fh *fhp)
{
- struct dentry *dentry = fhp->fh_dentry;
- if (!fhp->fh_no_wcc && dentry && d_really_is_positive(dentry)) {
- __be32 err;
- struct kstat stat;
-
- err = fh_getattr(fhp, &stat);
- if (!err) {
- *p++ = xdr_one; /* attributes follow */
- lease_get_mtime(d_inode(dentry), &stat.mtime);
- return encode_fattr3(rqstp, p, fhp, &stat);
- }
+ if (!fhp->fh_pre_saved) {
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return false;
+ return true;
}
- *p++ = xdr_zero;
- return p;
+
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ return svcxdr_encode_wcc_attr(xdr, fhp);
}
-/* Helper for NFSv3 ACLs */
-__be32 *
-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+/**
+ * svcxdr_encode_post_op_attr - Encode NFSv3 post-op attributes
+ * @rqstp: Context of a completed RPC transaction
+ * @xdr: XDR stream
+ * @fhp: File handle to encode
+ *
+ * Return values:
+ * %false: Send buffer space was exhausted
+ * %true: Success
+ */
+bool
+svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp)
{
- return encode_post_op_attr(rqstp, p, fhp);
+ struct dentry *dentry = fhp->fh_dentry;
+ struct kstat stat;
+
+ /*
+ * The inode may be NULL if the call failed because of a
+ * stale file handle. In this case, no attributes are
+ * returned.
+ */
+ if (fhp->fh_no_wcc || !dentry || !d_really_is_positive(dentry))
+ goto no_post_op_attrs;
+ if (fh_getattr(fhp, &stat) != nfs_ok)
+ goto no_post_op_attrs;
+
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ lease_get_mtime(d_inode(dentry), &stat.mtime);
+ if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &stat))
+ return false;
+
+ return true;
+
+no_post_op_attrs:
+ return xdr_stream_encode_item_absent(xdr) > 0;
}
/*
- * Enocde weak cache consistency data
+ * Encode weak cache consistency data
*/
-static __be32 *
-encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+static bool
+svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp)
{
- struct dentry *dentry = fhp->fh_dentry;
-
- if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) {
- if (fhp->fh_pre_saved) {
- *p++ = xdr_one;
- p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size);
- p = encode_time3(p, &fhp->fh_pre_mtime);
- p = encode_time3(p, &fhp->fh_pre_ctime);
- } else {
- *p++ = xdr_zero;
- }
- return encode_saved_post_attr(rqstp, p, fhp);
- }
- /* no pre- or post-attrs */
- *p++ = xdr_zero;
- return encode_post_op_attr(rqstp, p, fhp);
+ struct dentry *dentry = fhp->fh_dentry;
+
+ if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved)
+ goto neither;
+
+ /* before */
+ if (!svcxdr_encode_pre_op_attr(xdr, fhp))
+ return false;
+
+ /* after */
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &fhp->fh_post_attr))
+ return false;
+
+ return true;
+
+neither:
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return false;
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, fhp))
+ return false;
+
+ return true;
}
static bool fs_supports_change_attribute(struct super_block *sb)
@@ -713,210 +836,252 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
/* GETATTR */
int
-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
+nfs3svc_encode_getattrres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status == 0) {
- lease_get_mtime(d_inode(resp->fh.fh_dentry),
- &resp->stat.mtime);
- p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime);
+ if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ break;
}
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
/* SETATTR, REMOVE, RMDIR */
int
nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_wcc_data(rqstp, p, &resp->fh);
- return xdr_ressize_check(rqstp, p);
+ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
+ svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh);
}
/* LOOKUP */
-int
-nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
+int nfs3svc_encode_lookupres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status == 0) {
- p = encode_fh(p, &resp->fh);
- p = encode_post_op_attr(rqstp, p, &resp->fh);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
+ return 0;
}
- p = encode_post_op_attr(rqstp, p, &resp->dirfh);
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
/* ACCESS */
int
nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_accessres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_post_op_attr(rqstp, p, &resp->fh);
- if (resp->status == 0)
- *p++ = htonl(resp->access);
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->access) < 0)
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ }
+
+ return 1;
}
/* READLINK */
int
nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
- *p++ = resp->status;
- p = encode_post_op_attr(rqstp, p, &resp->fh);
- if (resp->status == 0) {
- *p++ = htonl(resp->len);
- xdr_ressize_check(rqstp, p);
- rqstp->rq_res.page_len = resp->len;
- if (resp->len & 3) {
- /* need to pad the tail */
- rqstp->rq_res.tail[0].iov_base = p;
- *p = 0;
- rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
- }
- if (svc_encode_result_payload(rqstp, head->iov_len, resp->len))
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
return 0;
- return 1;
- } else
- return xdr_ressize_check(rqstp, p);
+ if (xdr_stream_encode_u32(xdr, resp->len) < 0)
+ return 0;
+ xdr_write_pages(xdr, resp->pages, 0, resp->len);
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ }
+
+ return 1;
}
/* READ */
int
nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
- *p++ = resp->status;
- p = encode_post_op_attr(rqstp, p, &resp->fh);
- if (resp->status == 0) {
- *p++ = htonl(resp->count);
- *p++ = htonl(resp->eof);
- *p++ = htonl(resp->count); /* xdr opaque count */
- xdr_ressize_check(rqstp, p);
- /* now update rqstp->rq_res to reflect data as well */
- rqstp->rq_res.page_len = resp->count;
- if (resp->count & 3) {
- /* need to pad the tail */
- rqstp->rq_res.tail[0].iov_base = p;
- *p = 0;
- rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
- }
- if (svc_encode_result_payload(rqstp, head->iov_len,
- resp->count))
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
return 0;
- return 1;
- } else
- return xdr_ressize_check(rqstp, p);
+ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+ return 0;
+ if (xdr_stream_encode_bool(xdr, resp->eof) < 0)
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+ return 0;
+ xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
+ resp->count);
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ }
+
+ return 1;
}
/* WRITE */
int
nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_writeres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_wcc_data(rqstp, p, &resp->fh);
- if (resp->status == 0) {
- *p++ = htonl(resp->count);
- *p++ = htonl(resp->committed);
- *p++ = resp->verf[0];
- *p++ = resp->verf[1];
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->committed) < 0)
+ return 0;
+ if (!svcxdr_encode_writeverf3(xdr, resp->verf))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
+ return 0;
}
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
/* CREATE, MKDIR, SYMLINK, MKNOD */
int
nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_diropres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status == 0) {
- *p++ = xdr_one;
- p = encode_fh(p, &resp->fh);
- p = encode_post_op_attr(rqstp, p, &resp->fh);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
+ return 0;
}
- p = encode_wcc_data(rqstp, p, &resp->dirfh);
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
/* RENAME */
int
nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_renameres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_wcc_data(rqstp, p, &resp->ffh);
- p = encode_wcc_data(rqstp, p, &resp->tfh);
- return xdr_ressize_check(rqstp, p);
+ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
+ svcxdr_encode_wcc_data(rqstp, xdr, &resp->ffh) &&
+ svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh);
}
/* LINK */
int
nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_linkres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_post_op_attr(rqstp, p, &resp->fh);
- p = encode_wcc_data(rqstp, p, &resp->tfh);
- return xdr_ressize_check(rqstp, p);
+ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
+ svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh) &&
+ svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh);
}
/* READDIR */
int
nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
+ struct xdr_buf *dirlist = &resp->dirlist;
- *p++ = resp->status;
- p = encode_post_op_attr(rqstp, p, &resp->fh);
-
- if (resp->status == 0) {
- /* stupid readdir cookie */
- memcpy(p, resp->verf, 8); p += 2;
- xdr_ressize_check(rqstp, p);
- if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE)
- return 1; /*No room for trailer */
- rqstp->rq_res.page_len = (resp->count) << 2;
-
- /* add the 'tail' to the end of the 'head' page - page 0. */
- rqstp->rq_res.tail[0].iov_base = p;
- *p++ = 0; /* no more entries */
- *p++ = htonl(resp->common.err == nfserr_eof);
- rqstp->rq_res.tail[0].iov_len = 2<<2;
- return 1;
- } else
- return xdr_ressize_check(rqstp, p);
-}
-
-static __be32 *
-encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
- int namlen, u64 ino)
-{
- *p++ = xdr_one; /* mark entry present */
- p = xdr_encode_hyper(p, ino); /* file id */
- p = xdr_encode_array(p, name, namlen);/* name length & name */
-
- cd->offset = p; /* remember pointer */
- p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
+ return 0;
+ xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+ /* no more entries */
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return 0;
+ if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
+ return 0;
+ }
- return p;
+ return 1;
}
static __be32
@@ -957,267 +1122,327 @@ out:
return rv;
}
-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
+/**
+ * nfs3svc_encode_cookie3 - Encode a directory offset cookie
+ * @resp: readdir result context
+ * @offset: offset cookie to encode
+ *
+ * The buffer space for the offset cookie has already been reserved
+ * by svcxdr_encode_entry3_common().
+ */
+void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset)
{
- struct svc_fh *fh = &cd->scratch;
- __be32 err;
-
- fh_init(fh, NFS3_FHSIZE);
- err = compose_entry_fh(cd, fh, name, namlen, ino);
- if (err) {
- *p++ = 0;
- *p++ = 0;
- goto out;
- }
- p = encode_post_op_attr(cd->rqstp, p, fh);
- *p++ = xdr_one; /* yes, a file handle follows */
- p = encode_fh(p, fh);
-out:
- fh_put(fh);
- return p;
-}
+ __be64 cookie = cpu_to_be64(offset);
-/*
- * Encode a directory entry. This one works for both normal readdir
- * and readdirplus.
- * The normal readdir reply requires 2 (fileid) + 1 (stringlen)
- * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen.
- *
- * The readdirplus baggage is 1+21 words for post_op_attr, plus the
- * file handle.
- */
+ if (!resp->cookie_offset)
+ return;
+ write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie,
+ sizeof(cookie));
+ resp->cookie_offset = 0;
+}
-#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1)
-#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2))
-static int
-encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type, int plus)
+static bool
+svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
+ int namlen, loff_t offset, u64 ino)
{
- struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
- common);
- __be32 *p = cd->buffer;
- caddr_t curr_page_addr = NULL;
- struct page ** page;
- int slen; /* string (name) length */
- int elen; /* estimated entry length in words */
- int num_entry_words = 0; /* actual number of words */
-
- if (cd->offset) {
- u64 offset64 = offset;
-
- if (unlikely(cd->offset1)) {
- /* we ended up with offset on a page boundary */
- *cd->offset = htonl(offset64 >> 32);
- *cd->offset1 = htonl(offset64 & 0xffffffff);
- cd->offset1 = NULL;
- } else {
- xdr_encode_hyper(cd->offset, offset64);
- }
- cd->offset = NULL;
- }
+ struct xdr_buf *dirlist = &resp->dirlist;
+ struct xdr_stream *xdr = &resp->xdr;
- /*
- dprintk("encode_entry(%.*s @%ld%s)\n",
- namlen, name, (long) offset, plus? " plus" : "");
- */
-
- /* truncate filename if too long */
- namlen = min(namlen, NFS3_MAXNAMLEN);
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ /* fileid */
+ if (xdr_stream_encode_u64(xdr, ino) < 0)
+ return false;
+ /* name */
+ if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS3_MAXNAMLEN)) < 0)
+ return false;
+ /* cookie */
+ resp->cookie_offset = dirlist->len;
+ if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0)
+ return false;
- slen = XDR_QUADLEN(namlen);
- elen = slen + NFS3_ENTRY_BAGGAGE
- + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0);
+ return true;
+}
- if (cd->buflen < elen) {
- cd->common.err = nfserr_toosmall;
- return -EINVAL;
- }
+/**
+ * nfs3svc_encode_entry3 - encode one NFSv3 READDIR entry
+ * @data: directory context
+ * @name: name of the object to be encoded
+ * @namlen: length of that name, in bytes
+ * @offset: the offset of the previous entry
+ * @ino: the fileid of this entry
+ * @d_type: unused
+ *
+ * Return values:
+ * %0: Entry was successfully encoded.
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
+ *
+ * On exit, the following fields are updated:
+ * - resp->xdr
+ * - resp->common.err
+ * - resp->cookie_offset
+ */
+int nfs3svc_encode_entry3(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct readdir_cd *ccd = data;
+ struct nfsd3_readdirres *resp = container_of(ccd,
+ struct nfsd3_readdirres,
+ common);
+ unsigned int starting_length = resp->dirlist.len;
- /* determine which page in rq_respages[] we are currently filling */
- for (page = cd->rqstp->rq_respages + 1;
- page < cd->rqstp->rq_next_page; page++) {
- curr_page_addr = page_address(*page);
+ /* The offset cookie for the previous entry */
+ nfs3svc_encode_cookie3(resp, offset);
- if (((caddr_t)cd->buffer >= curr_page_addr) &&
- ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
- break;
- }
+ if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino))
+ goto out_toosmall;
- if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) {
- /* encode entry in current page */
+ xdr_commit_encode(&resp->xdr);
+ resp->common.err = nfs_ok;
+ return 0;
- p = encode_entry_baggage(cd, p, name, namlen, ino);
+out_toosmall:
+ resp->cookie_offset = 0;
+ resp->common.err = nfserr_toosmall;
+ resp->dirlist.len = starting_length;
+ return -EINVAL;
+}
- if (plus)
- p = encode_entryplus_baggage(cd, p, name, namlen, ino);
- num_entry_words = p - cd->buffer;
- } else if (*(page+1) != NULL) {
- /* temporarily encode entry into next page, then move back to
- * current and next page in rq_respages[] */
- __be32 *p1, *tmp;
- int len1, len2;
+static bool
+svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name,
+ int namlen, u64 ino)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct svc_fh *fhp = &resp->scratch;
+ bool result;
- /* grab next page for temporary storage of entry */
- p1 = tmp = page_address(*(page+1));
+ result = false;
+ fh_init(fhp, NFS3_FHSIZE);
+ if (compose_entry_fh(resp, fhp, name, namlen, ino) != nfs_ok)
+ goto out_noattrs;
- p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
+ if (!svcxdr_encode_post_op_attr(resp->rqstp, xdr, fhp))
+ goto out;
+ if (!svcxdr_encode_post_op_fh3(xdr, fhp))
+ goto out;
+ result = true;
- if (plus)
- p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
+out:
+ fh_put(fhp);
+ return result;
- /* determine entry word length and lengths to go in pages */
- num_entry_words = p1 - tmp;
- len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer;
- if ((num_entry_words << 2) < len1) {
- /* the actual number of words in the entry is less
- * than elen and can still fit in the current page
- */
- memmove(p, tmp, num_entry_words << 2);
- p += num_entry_words;
-
- /* update offset */
- cd->offset = cd->buffer + (cd->offset - tmp);
- } else {
- unsigned int offset_r = (cd->offset - tmp) << 2;
-
- /* update pointer to offset location.
- * This is a 64bit quantity, so we need to
- * deal with 3 cases:
- * - entirely in first page
- * - entirely in second page
- * - 4 bytes in each page
- */
- if (offset_r + 8 <= len1) {
- cd->offset = p + (cd->offset - tmp);
- } else if (offset_r >= len1) {
- cd->offset -= len1 >> 2;
- } else {
- /* sitting on the fence */
- BUG_ON(offset_r != len1 - 4);
- cd->offset = p + (cd->offset - tmp);
- cd->offset1 = tmp;
- }
-
- len2 = (num_entry_words << 2) - len1;
-
- /* move from temp page to current and next pages */
- memmove(p, tmp, len1);
- memmove(tmp, (caddr_t)tmp+len1, len2);
-
- p = tmp + (len2 >> 2);
- }
- }
- else {
- cd->common.err = nfserr_toosmall;
- return -EINVAL;
- }
+out_noattrs:
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return false;
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return false;
+ return true;
+}
- cd->buflen -= num_entry_words;
- cd->buffer = p;
- cd->common.err = nfs_ok;
+/**
+ * nfs3svc_encode_entryplus3 - encode one NFSv3 READDIRPLUS entry
+ * @data: directory context
+ * @name: name of the object to be encoded
+ * @namlen: length of that name, in bytes
+ * @offset: the offset of the previous entry
+ * @ino: the fileid of this entry
+ * @d_type: unused
+ *
+ * Return values:
+ * %0: Entry was successfully encoded.
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
+ *
+ * On exit, the following fields are updated:
+ * - resp->xdr
+ * - resp->common.err
+ * - resp->cookie_offset
+ */
+int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct readdir_cd *ccd = data;
+ struct nfsd3_readdirres *resp = container_of(ccd,
+ struct nfsd3_readdirres,
+ common);
+ unsigned int starting_length = resp->dirlist.len;
+
+ /* The offset cookie for the previous entry */
+ nfs3svc_encode_cookie3(resp, offset);
+
+ if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino))
+ goto out_toosmall;
+ if (!svcxdr_encode_entry3_plus(resp, name, namlen, ino))
+ goto out_toosmall;
+
+ xdr_commit_encode(&resp->xdr);
+ resp->common.err = nfs_ok;
return 0;
+out_toosmall:
+ resp->cookie_offset = 0;
+ resp->common.err = nfserr_toosmall;
+ resp->dirlist.len = starting_length;
+ return -EINVAL;
}
-int
-nfs3svc_encode_entry(void *cd, const char *name,
- int namlen, loff_t offset, u64 ino, unsigned int d_type)
+static bool
+svcxdr_encode_fsstat3resok(struct xdr_stream *xdr,
+ const struct nfsd3_fsstatres *resp)
{
- return encode_entry(cd, name, namlen, offset, ino, d_type, 0);
-}
+ const struct kstatfs *s = &resp->stats;
+ u64 bs = s->f_bsize;
+ __be32 *p;
-int
-nfs3svc_encode_entry_plus(void *cd, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- return encode_entry(cd, name, namlen, offset, ino, d_type, 1);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 13);
+ if (!p)
+ return false;
+ p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */
+ p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */
+ p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */
+ p = xdr_encode_hyper(p, s->f_files); /* total inodes */
+ p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */
+ p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */
+ *p = cpu_to_be32(resp->invarsec); /* mean unchanged time */
+
+ return true;
}
/* FSSTAT */
int
nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
- struct kstatfs *s = &resp->stats;
- u64 bs = s->f_bsize;
-
- *p++ = resp->status;
- *p++ = xdr_zero; /* no post_op_attr */
-
- if (resp->status == 0) {
- p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */
- p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */
- p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */
- p = xdr_encode_hyper(p, s->f_files); /* total inodes */
- p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */
- p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */
- *p++ = htonl(resp->invarsec); /* mean unchanged time */
+
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
+ if (!svcxdr_encode_fsstat3resok(xdr, resp))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
}
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
+}
+
+static bool
+svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr,
+ const struct nfsd3_fsinfores *resp)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 12);
+ if (!p)
+ return false;
+ *p++ = cpu_to_be32(resp->f_rtmax);
+ *p++ = cpu_to_be32(resp->f_rtpref);
+ *p++ = cpu_to_be32(resp->f_rtmult);
+ *p++ = cpu_to_be32(resp->f_wtmax);
+ *p++ = cpu_to_be32(resp->f_wtpref);
+ *p++ = cpu_to_be32(resp->f_wtmult);
+ *p++ = cpu_to_be32(resp->f_dtpref);
+ p = xdr_encode_hyper(p, resp->f_maxfilesize);
+ p = encode_nfstime3(p, &nfs3svc_time_delta);
+ *p = cpu_to_be32(resp->f_properties);
+
+ return true;
}
/* FSINFO */
int
nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_fsinfores *resp = rqstp->rq_resp;
- *p++ = resp->status;
- *p++ = xdr_zero; /* no post_op_attr */
-
- if (resp->status == 0) {
- *p++ = htonl(resp->f_rtmax);
- *p++ = htonl(resp->f_rtpref);
- *p++ = htonl(resp->f_rtmult);
- *p++ = htonl(resp->f_wtmax);
- *p++ = htonl(resp->f_wtpref);
- *p++ = htonl(resp->f_wtmult);
- *p++ = htonl(resp->f_dtpref);
- p = xdr_encode_hyper(p, resp->f_maxfilesize);
- *p++ = xdr_one;
- *p++ = xdr_zero;
- *p++ = htonl(resp->f_properties);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
+ if (!svcxdr_encode_fsinfo3resok(xdr, resp))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
}
- return xdr_ressize_check(rqstp, p);
+ return 1;
+}
+
+static bool
+svcxdr_encode_pathconf3resok(struct xdr_stream *xdr,
+ const struct nfsd3_pathconfres *resp)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 6);
+ if (!p)
+ return false;
+ *p++ = cpu_to_be32(resp->p_link_max);
+ *p++ = cpu_to_be32(resp->p_name_max);
+ p = xdr_encode_bool(p, resp->p_no_trunc);
+ p = xdr_encode_bool(p, resp->p_chown_restricted);
+ p = xdr_encode_bool(p, resp->p_case_insensitive);
+ xdr_encode_bool(p, resp->p_case_preserving);
+
+ return true;
}
/* PATHCONF */
int
nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_pathconfres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- *p++ = xdr_zero; /* no post_op_attr */
-
- if (resp->status == 0) {
- *p++ = htonl(resp->p_link_max);
- *p++ = htonl(resp->p_name_max);
- *p++ = htonl(resp->p_no_trunc);
- *p++ = htonl(resp->p_chown_restricted);
- *p++ = htonl(resp->p_case_insensitive);
- *p++ = htonl(resp->p_case_preserving);
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
+ if (!svcxdr_encode_pathconf3resok(xdr, resp))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
+ return 0;
}
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
/* COMMIT */
int
nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd3_commitres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- p = encode_wcc_data(rqstp, p, &resp->fh);
- /* Write verifier */
- if (resp->status == 0) {
- *p++ = resp->verf[0];
- *p++ = resp->verf[1];
+ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_writeverf3(xdr, resp->verf))
+ return 0;
+ break;
+ default:
+ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
+ return 0;
}
- return xdr_ressize_check(rqstp, p);
+
+ return 1;
}
/*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index dd9f38d072dd..daf43b980d4b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1383,10 +1383,13 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
{
ssize_t bytes_copied = 0;
- size_t bytes_total = copy->cp_count;
+ u64 bytes_total = copy->cp_count;
u64 src_pos = copy->cp_src_pos;
u64 dst_pos = copy->cp_dst_pos;
+ /* See RFC 7862 p.67: */
+ if (bytes_total == 0)
+ bytes_total = ULLONG_MAX;
do {
if (kthread_should_stop())
break;
@@ -1538,8 +1541,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
refcount_set(&async_copy->refcount, 1);
- memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid,
- sizeof(copy->cp_stateid));
+ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
+ sizeof(copy->cp_res.cb_stateid));
dup_copy_fields(copy, async_copy);
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
async_copy, "%s", "copy thread");
@@ -2262,25 +2265,6 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
}
-static void svcxdr_init_encode(struct svc_rqst *rqstp,
- struct nfsd4_compoundres *resp)
-{
- struct xdr_stream *xdr = &resp->xdr;
- struct xdr_buf *buf = &rqstp->rq_res;
- struct kvec *head = buf->head;
-
- xdr->buf = buf;
- xdr->iov = head;
- xdr->p = head->iov_base + head->iov_len;
- xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
- /* Tail and page_len should be zero at this point: */
- buf->len = buf->head[0].iov_len;
- xdr_reset_scratch_buffer(xdr);
- xdr->page_ptr = buf->pages - 1;
- buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
- - rqstp->rq_auth_slack;
-}
-
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
static void
check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
@@ -2335,10 +2319,14 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
__be32 status;
- svcxdr_init_encode(rqstp, resp);
- resp->tagp = resp->xdr.p;
+ resp->xdr = &rqstp->rq_res_stream;
+
+ /* reserve space for: NFS status code */
+ xdr_reserve_space(resp->xdr, XDR_UNIT);
+
+ resp->tagp = resp->xdr->p;
/* reserve space for: taglen, tag, and opcnt */
- xdr_reserve_space(&resp->xdr, 8 + args->taglen);
+ xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);
resp->taglen = args->taglen;
resp->tag = args->tag;
resp->rqstp = rqstp;
@@ -2444,7 +2432,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
encode_op:
if (op->status == nfserr_replay_me) {
op->replay = &cstate->replay_owner->so_replay;
- nfsd4_encode_replay(&resp->xdr, op);
+ nfsd4_encode_replay(resp->xdr, op);
status = op->status = op->replay->rp_status;
} else {
nfsd4_encode_operation(resp, op);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 891395c6c7d3..6fedc49726bf 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -626,7 +626,7 @@ nfsd4_legacy_tracking_init(struct net *net)
status = nfsd4_load_reboot_recovery_data(net);
if (status)
goto err;
- printk("NFSD: Using legacy client tracking operations.\n");
+ pr_info("NFSD: Using legacy client tracking operations.\n");
return 0;
err:
@@ -1028,7 +1028,7 @@ nfsd4_init_cld_pipe(struct net *net)
status = __nfsd4_init_cld_pipe(net);
if (!status)
- printk("NFSD: Using old nfsdcld client tracking operations.\n");
+ pr_info("NFSD: Using old nfsdcld client tracking operations.\n");
return status;
}
@@ -1605,7 +1605,7 @@ nfsd4_cld_tracking_init(struct net *net)
nfs4_release_reclaim(nn);
goto err_remove;
} else
- printk("NFSD: Using nfsdcld client tracking operations.\n");
+ pr_info("NFSD: Using nfsdcld client tracking operations.\n");
return 0;
err_remove:
@@ -1864,7 +1864,7 @@ nfsd4_umh_cltrack_init(struct net *net)
ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
kfree(grace_start);
if (!ret)
- printk("NFSD: Using UMH upcall client tracking operations.\n");
+ pr_info("NFSD: Using UMH upcall client tracking operations.\n");
return ret;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 97447a64bad0..7698172ac0c7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -43,6 +43,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/jhash.h>
#include <linux/string_helpers.h>
+#include <linux/fsnotify.h>
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -2352,6 +2353,10 @@ static int client_info_show(struct seq_file *m, void *v)
memcpy(&clid, &clp->cl_clientid, sizeof(clid));
seq_printf(m, "clientid: 0x%llx\n", clid);
seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
+ if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+ seq_puts(m, "status: confirmed\n");
+ else
+ seq_puts(m, "status: unconfirmed\n");
seq_printf(m, "name: ");
seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
@@ -2702,6 +2707,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
int ret;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct dentry *dentries[ARRAY_SIZE(client_files)];
clp = alloc_client(name);
if (clp == NULL)
@@ -2721,9 +2727,11 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
clp->cl_cb_session = NULL;
clp->net = net;
- clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs,
- clp->cl_clientid.cl_id - nn->clientid_base,
- client_files);
+ clp->cl_nfsd_dentry = nfsd_client_mkdir(
+ nn, &clp->cl_nfsdfs,
+ clp->cl_clientid.cl_id - nn->clientid_base,
+ client_files, dentries);
+ clp->cl_nfsd_info_dentry = dentries[0];
if (!clp->cl_nfsd_dentry) {
free_client(clp);
return NULL;
@@ -2798,7 +2806,10 @@ move_to_confirmed(struct nfs4_client *clp)
list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
add_clp_to_name_tree(clp, &nn->conf_name_tree);
- set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+ if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) &&
+ clp->cl_nfsd_dentry &&
+ clp->cl_nfsd_info_dentry)
+ fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY);
renew_client_locked(clp);
}
@@ -2903,7 +2914,7 @@ out_err:
static void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
- struct xdr_buf *buf = resp->xdr.buf;
+ struct xdr_buf *buf = resp->xdr->buf;
struct nfsd4_slot *slot = resp->cstate.slot;
unsigned int base;
@@ -2973,7 +2984,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
struct nfsd4_slot *slot = resp->cstate.slot;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
__be32 status;
@@ -3708,7 +3719,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfsd4_sequence *seq = &u->sequence;
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct nfsd4_session *session;
struct nfs4_client *clp;
struct nfsd4_slot *slot;
@@ -5338,6 +5349,22 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
return true;
}
+struct laundry_time {
+ time64_t cutoff;
+ time64_t new_timeo;
+};
+
+static bool state_expired(struct laundry_time *lt, time64_t last_refresh)
+{
+ time64_t time_remaining;
+
+ if (last_refresh < lt->cutoff)
+ return true;
+ time_remaining = last_refresh - lt->cutoff;
+ lt->new_timeo = min(lt->new_timeo, time_remaining);
+ return false;
+}
+
static time64_t
nfs4_laundromat(struct nfsd_net *nn)
{
@@ -5347,14 +5374,16 @@ nfs4_laundromat(struct nfsd_net *nn)
struct nfs4_ol_stateid *stp;
struct nfsd4_blocked_lock *nbl;
struct list_head *pos, *next, reaplist;
- time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease;
- time64_t t, new_timeo = nn->nfsd4_lease;
+ struct laundry_time lt = {
+ .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease,
+ .new_timeo = nn->nfsd4_lease
+ };
struct nfs4_cpntf_state *cps;
copy_stateid_t *cps_t;
int i;
if (clients_still_reclaiming(nn)) {
- new_timeo = 0;
+ lt.new_timeo = 0;
goto out;
}
nfsd4_end_grace(nn);
@@ -5364,7 +5393,7 @@ nfs4_laundromat(struct nfsd_net *nn)
idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
- cps->cpntf_time < cutoff)
+ state_expired(&lt, cps->cpntf_time))
_free_cpntf_state_locked(nn, cps);
}
spin_unlock(&nn->s2s_cp_lock);
@@ -5372,11 +5401,8 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_lock(&nn->client_lock);
list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (clp->cl_time > cutoff) {
- t = clp->cl_time - cutoff;
- new_timeo = min(new_timeo, t);
+ if (!state_expired(&lt, clp->cl_time))
break;
- }
if (mark_client_expired_locked(clp)) {
trace_nfsd_clid_expired(&clp->cl_clientid);
continue;
@@ -5393,11 +5419,8 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- if (dp->dl_time > cutoff) {
- t = dp->dl_time - cutoff;
- new_timeo = min(new_timeo, t);
+ if (!state_expired(&lt, dp->dl_time))
break;
- }
WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -5413,11 +5436,8 @@ nfs4_laundromat(struct nfsd_net *nn)
while (!list_empty(&nn->close_lru)) {
oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
oo_close_lru);
- if (oo->oo_time > cutoff) {
- t = oo->oo_time - cutoff;
- new_timeo = min(new_timeo, t);
+ if (!state_expired(&lt, oo->oo_time))
break;
- }
list_del_init(&oo->oo_close_lru);
stp = oo->oo_last_closed_stid;
oo->oo_last_closed_stid = NULL;
@@ -5443,11 +5463,8 @@ nfs4_laundromat(struct nfsd_net *nn)
while (!list_empty(&nn->blocked_locks_lru)) {
nbl = list_first_entry(&nn->blocked_locks_lru,
struct nfsd4_blocked_lock, nbl_lru);
- if (nbl->nbl_time > cutoff) {
- t = nbl->nbl_time - cutoff;
- new_timeo = min(new_timeo, t);
+ if (!state_expired(&lt, nbl->nbl_time))
break;
- }
list_move(&nbl->nbl_lru, &reaplist);
list_del_init(&nbl->nbl_list);
}
@@ -5460,8 +5477,7 @@ nfs4_laundromat(struct nfsd_net *nn)
free_blocked_lock(nbl);
}
out:
- new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
- return new_timeo;
+ return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
}
static struct workqueue_struct *laundry_wq;
@@ -7321,14 +7337,9 @@ nfs4_state_start_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- ret = get_nfsdfs(net);
- if (ret)
- return ret;
ret = nfs4_state_create_net(net);
- if (ret) {
- mntput(nn->nfsd_mnt);
+ if (ret)
return ret;
- }
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
@@ -7398,7 +7409,6 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
- mntput(nn->nfsd_mnt);
}
void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index eaaa1605b5b5..e0f06d3cbd44 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3581,7 +3581,7 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 8);
@@ -3594,7 +3594,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
@@ -3611,7 +3611,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &close->cl_stateid);
}
@@ -3620,7 +3620,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
static __be32
nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
@@ -3634,7 +3634,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 20);
@@ -3649,7 +3649,7 @@ static __be32
nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
{
struct svc_fh *fhp = getattr->ga_fhp;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
getattr->ga_bmval, resp->rqstp, 0);
@@ -3658,7 +3658,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct svc_fh *fhp = *fhpp;
unsigned int len;
__be32 *p;
@@ -3713,7 +3713,7 @@ again:
static __be32
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
if (!nfserr)
nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
@@ -3726,7 +3726,7 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
static __be32
nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
@@ -3736,7 +3736,7 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
}
@@ -3745,7 +3745,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 20);
@@ -3759,7 +3759,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
@@ -3853,7 +3853,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
}
@@ -3861,7 +3861,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &od->od_stateid);
}
@@ -3871,7 +3871,7 @@ static __be32 nfsd4_encode_splice_read(
struct nfsd4_read *read,
struct file *file, unsigned long maxcount)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct xdr_buf *buf = xdr->buf;
int status, space_left;
u32 eof;
@@ -3937,7 +3937,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
struct file *file, unsigned long maxcount)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
u32 eof;
int starting_len = xdr->buf->len - 8;
__be32 nfserr;
@@ -3976,7 +3976,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
unsigned long maxcount;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct file *file;
int starting_len = xdr->buf->len;
__be32 *p;
@@ -3990,7 +3990,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
return nfserr_resource;
}
- if (resp->xdr.buf->page_len &&
+ if (resp->xdr->buf->page_len &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
WARN_ON_ONCE(1);
return nfserr_resource;
@@ -4020,7 +4020,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
int maxcount;
__be32 wire_count;
int zero = 0;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
int length_offset = xdr->buf->len;
int status;
__be32 *p;
@@ -4072,7 +4072,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
int bytes_left;
loff_t offset;
__be64 wire_offset;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
int starting_len = xdr->buf->len;
__be32 *p;
@@ -4083,8 +4083,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
*p++ = cpu_to_be32(0);
*p++ = cpu_to_be32(0);
- resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
- - (char *)resp->xdr.buf->head[0].iov_base;
+ xdr->buf->head[0].iov_len = (char *)xdr->p -
+ (char *)xdr->buf->head[0].iov_base;
/*
* Number of bytes left for directory entries allowing for the
@@ -4159,7 +4159,7 @@ err_no_verf:
static __be32
nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 20);
@@ -4172,7 +4172,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 40);
@@ -4255,7 +4255,7 @@ static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo *secinfo)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp);
}
@@ -4264,7 +4264,7 @@ static __be32
nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo_no_name *secinfo)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
}
@@ -4276,7 +4276,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 16);
@@ -4300,7 +4300,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
static __be32
nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
if (!nfserr) {
@@ -4324,7 +4324,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
static __be32
nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 16);
@@ -4341,7 +4341,7 @@ static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
char *major_id;
char *server_scope;
@@ -4419,7 +4419,7 @@ static __be32
nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create_session *sess)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 24);
@@ -4472,7 +4472,7 @@ static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_sequence *seq)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
@@ -4495,7 +4495,7 @@ static __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_test_stateid *test_stateid)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
@@ -4516,7 +4516,7 @@ static __be32
nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_getdeviceinfo *gdev)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
__be32 *p;
@@ -4572,7 +4572,7 @@ static __be32
nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutget *lgp)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
__be32 *p;
@@ -4599,7 +4599,7 @@ static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutcommit *lcp)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 4);
@@ -4620,7 +4620,7 @@ static __be32
nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutreturn *lrp)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 4);
@@ -4638,7 +4638,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
struct nfsd42_write_res *write, bool sync)
{
__be32 *p;
- p = xdr_reserve_space(&resp->xdr, 4);
+ p = xdr_reserve_space(resp->xdr, 4);
if (!p)
return nfserr_resource;
@@ -4647,11 +4647,11 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
else {
__be32 nfserr;
*p++ = cpu_to_be32(1);
- nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
+ nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
if (nfserr)
return nfserr;
}
- p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
+ p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
if (!p)
return nfserr_resource;
@@ -4665,7 +4665,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
static __be32
nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct nfs42_netaddr *addr;
__be32 *p;
@@ -4713,7 +4713,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
if (nfserr)
return nfserr;
- p = xdr_reserve_space(&resp->xdr, 4 + 4);
+ p = xdr_reserve_space(resp->xdr, 4 + 4);
*p++ = xdr_one; /* cr_consecutive */
*p++ = cpu_to_be32(copy->cp_synchronous);
return 0;
@@ -4723,7 +4723,7 @@ static __be32
nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_offload_status *os)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 8 + 4);
@@ -4740,7 +4740,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
unsigned long *maxcount, u32 *eof,
loff_t *pos)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct file *file = read->rd_nf->nf_file;
int starting_len = xdr->buf->len;
loff_t hole_pos;
@@ -4799,7 +4799,7 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
count = data_pos - read->rd_offset;
/* Content type, offset, byte count */
- p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
if (!p)
return nfserr_resource;
@@ -4817,7 +4817,7 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
unsigned long maxcount, count;
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct file *file;
int starting_len = xdr->buf->len;
int last_segment = xdr->buf->len;
@@ -4888,7 +4888,7 @@ static __be32
nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_copy_notify *cn)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
if (nfserr)
@@ -4924,7 +4924,7 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
{
__be32 *p;
- p = xdr_reserve_space(&resp->xdr, 4 + 8);
+ p = xdr_reserve_space(resp->xdr, 4 + 8);
*p++ = cpu_to_be32(seek->seek_eof);
p = xdr_encode_hyper(p, seek->seek_pos);
@@ -4985,7 +4985,7 @@ static __be32
nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_getxattr *getxattr)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p, err;
p = xdr_reserve_space(xdr, 4);
@@ -5009,7 +5009,7 @@ static __be32
nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_setxattr *setxattr)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 20);
@@ -5050,7 +5050,7 @@ static __be32
nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_listxattrs *listxattrs)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
u32 cookie_offset, count_offset, eof;
u32 left, xdrleft, slen, count;
u32 xdrlen, offset;
@@ -5161,7 +5161,7 @@ static __be32
nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_removexattr *removexattr)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
__be32 *p;
p = xdr_reserve_space(xdr, 20);
@@ -5301,7 +5301,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
- struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_stream *xdr = resp->xdr;
struct nfs4_stateowner *so = resp->cstate.replay_owner;
struct svc_rqst *rqstp = resp->rqstp;
const struct nfsd4_operation *opdesc = op->opdesc;
@@ -5430,14 +5430,14 @@ int
nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct xdr_buf *buf = resp->xdr.buf;
+ struct xdr_buf *buf = resp->xdr->buf;
WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
buf->tail[0].iov_len);
*p = resp->cstate.status;
- rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+ rqstp->rq_next_page = resp->xdr->page_ptr + 1;
p = resp->tagp;
*p++ = htonl(resp->taglen);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ef86ed23af82..853bf50a2a9b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1266,7 +1266,8 @@ static void nfsdfs_remove_files(struct dentry *root)
/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
* code instead. */
static int nfsdfs_create_files(struct dentry *root,
- const struct tree_descr *files)
+ const struct tree_descr *files,
+ struct dentry **fdentries)
{
struct inode *dir = d_inode(root);
struct inode *inode;
@@ -1275,8 +1276,6 @@ static int nfsdfs_create_files(struct dentry *root,
inode_lock(dir);
for (i = 0; files->name && files->name[0]; i++, files++) {
- if (!files->name)
- continue;
dentry = d_alloc_name(root, files->name);
if (!dentry)
goto out;
@@ -1290,6 +1289,8 @@ static int nfsdfs_create_files(struct dentry *root,
inode->i_private = __get_nfsdfs_client(dir);
d_add(dentry, inode);
fsnotify_create(dir, dentry);
+ if (fdentries)
+ fdentries[i] = dentry;
}
inode_unlock(dir);
return 0;
@@ -1301,8 +1302,9 @@ out:
/* on success, returns positive number unique to that client. */
struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
- struct nfsdfs_client *ncl, u32 id,
- const struct tree_descr *files)
+ struct nfsdfs_client *ncl, u32 id,
+ const struct tree_descr *files,
+ struct dentry **fdentries)
{
struct dentry *dentry;
char name[11];
@@ -1313,7 +1315,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
if (IS_ERR(dentry)) /* XXX: tossing errors? */
return NULL;
- ret = nfsdfs_create_files(dentry, files);
+ ret = nfsdfs_create_files(dentry, files, fdentries);
if (ret) {
nfsd_client_rmdir(dentry);
return NULL;
@@ -1416,6 +1418,8 @@ static void nfsd_umount(struct super_block *sb)
{
struct net *net = sb->s_fs_info;
+ nfsd_shutdown_threads(net);
+
kill_litter_super(sb);
put_net(net);
}
@@ -1428,18 +1432,6 @@ static struct file_system_type nfsd_fs_type = {
};
MODULE_ALIAS_FS("nfsd");
-int get_nfsdfs(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct vfsmount *mnt;
-
- mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
- nn->nfsd_mnt = mnt;
- return 0;
-}
-
#ifdef CONFIG_PROC_FS
static int create_proc_exports_entry(void)
{
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 8bdc37aa2c2e..14dbfa75059d 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -93,13 +93,12 @@ int nfsd_get_nrthreads(int n, int *, struct net *);
int nfsd_set_nrthreads(int n, int *, struct net *);
int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
+void nfsd_shutdown_threads(struct net *net);
void nfsd_destroy(struct net *net);
bool i_am_nfsd(void);
-int get_nfsdfs(struct net *);
-
struct nfsdfs_client {
struct kref cl_ref;
void (*cl_release)(struct kref *kref);
@@ -107,7 +106,9 @@ struct nfsdfs_client {
struct nfsdfs_client *get_nfsdfs_client(struct inode *);
struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
- struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
+ struct nfsdfs_client *ncl, u32 id,
+ const struct tree_descr *,
+ struct dentry **fdentries);
void nfsd_client_rmdir(struct dentry *dentry);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 10b44421eace..c475d2271f9c 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -711,7 +711,7 @@ char * SVCFH_fmt(struct svc_fh *fhp)
return buf;
}
-enum fsid_source fsid_source(struct svc_fh *fhp)
+enum fsid_source fsid_source(const struct svc_fh *fhp)
{
if (fhp->fh_handle.fh_version != 1)
return FSIDSOURCE_DEV;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index f58933519f38..aff2cda5c6c3 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -82,7 +82,7 @@ enum fsid_source {
FSIDSOURCE_FSID,
FSIDSOURCE_UUID,
};
-extern enum fsid_source fsid_source(struct svc_fh *fhp);
+extern enum fsid_source fsid_source(const struct svc_fh *fhp);
/*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a8d5449dd0e9..60d7c59e7935 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -151,13 +151,14 @@ nfsd_proc_readlink(struct svc_rqst *rqstp)
{
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
- char *buffer = page_address(*(rqstp->rq_next_page++));
dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
resp->len = NFS_MAXPATHLEN;
- resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len);
+ resp->page = *(rqstp->rq_next_page++);
+ resp->status = nfsd_readlink(rqstp, &argp->fh,
+ page_address(resp->page), &resp->len);
fh_put(&argp->fh);
return rpc_success;
@@ -184,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
v = 0;
len = argp->count;
+ resp->pages = rqstp->rq_next_page;
while (len > 0) {
struct page *page = *(rqstp->rq_next_page++);
@@ -381,7 +383,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
/* Make sure the type and device matches */
resp->status = nfserr_exist;
- if (inode && type != (inode->i_mode & S_IFMT))
+ if (inode && inode_wrong_type(inode, type))
goto out_unlock;
}
@@ -557,14 +559,27 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
struct nfsd_readdirres *resp,
int count)
{
+ struct xdr_buf *buf = &resp->dirlist;
+ struct xdr_stream *xdr = &resp->xdr;
+
count = min_t(u32, count, PAGE_SIZE);
- /* Convert byte count to number of words (i.e. >> 2),
- * and reserve room for the NULL ptr & eof flag (-2 words) */
- resp->buflen = (count >> 2) - 2;
+ memset(buf, 0, sizeof(*buf));
- resp->buffer = page_address(*rqstp->rq_next_page);
+ /* Reserve room for the NULL ptr & eof flag (-2 words) */
+ buf->buflen = count - sizeof(__be32) * 2;
+ buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
+
+ /* This is xdr_init_encode(), but it assumes that
+ * the head kvec has already been consumed. */
+ xdr_set_scratch_buffer(xdr, NULL, 0);
+ xdr->buf = buf;
+ xdr->page_ptr = buf->pages;
+ xdr->iov = NULL;
+ xdr->p = page_address(*buf->pages);
+ xdr->end = xdr->p + (PAGE_SIZE >> 2);
+ xdr->rqst = NULL;
}
/*
@@ -576,25 +591,19 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
struct nfsd_readdirargs *argp = rqstp->rq_argp;
struct nfsd_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- __be32 *buffer;
dprintk("nfsd: READDIR %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->count, argp->cookie);
nfsd_init_dirlist_pages(rqstp, resp, argp->count);
- buffer = resp->buffer;
- resp->offset = NULL;
resp->common.err = nfs_ok;
- /* Read directory and encode entries on the fly */
+ resp->cookie_offset = 0;
offset = argp->cookie;
resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
&resp->common, nfssvc_encode_entry);
-
- resp->count = resp->buffer - buffer;
- if (resp->offset)
- *resp->offset = htonl(offset);
+ nfssvc_encode_nfscookie(resp, offset);
fh_put(&argp->fh);
return rpc_success;
@@ -640,7 +649,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_GETATTR] = {
.pc_func = nfsd_proc_getattr,
.pc_decode = nfssvc_decode_fhandleargs,
- .pc_encode = nfssvc_encode_attrstat,
+ .pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
@@ -651,7 +660,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_SETATTR] = {
.pc_func = nfsd_proc_setattr,
.pc_decode = nfssvc_decode_sattrargs,
- .pc_encode = nfssvc_encode_attrstat,
+ .pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_sattrargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
@@ -714,7 +723,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_WRITE] = {
.pc_func = nfsd_proc_write,
.pc_decode = nfssvc_decode_writeargs,
- .pc_encode = nfssvc_encode_attrstat,
+ .pc_encode = nfssvc_encode_attrstatres,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_writeargs),
.pc_ressize = sizeof(struct nfsd_attrstat),
@@ -736,7 +745,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_REMOVE] = {
.pc_func = nfsd_proc_remove,
.pc_decode = nfssvc_decode_diropargs,
- .pc_encode = nfssvc_encode_stat,
+ .pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
@@ -746,7 +755,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_RENAME] = {
.pc_func = nfsd_proc_rename,
.pc_decode = nfssvc_decode_renameargs,
- .pc_encode = nfssvc_encode_stat,
+ .pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_renameargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
@@ -756,7 +765,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_LINK] = {
.pc_func = nfsd_proc_link,
.pc_decode = nfssvc_decode_linkargs,
- .pc_encode = nfssvc_encode_stat,
+ .pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_linkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
@@ -766,7 +775,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_SYMLINK] = {
.pc_func = nfsd_proc_symlink,
.pc_decode = nfssvc_decode_symlinkargs,
- .pc_encode = nfssvc_encode_stat,
+ .pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_symlinkargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
@@ -787,7 +796,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
[NFSPROC_RMDIR] = {
.pc_func = nfsd_proc_rmdir,
.pc_decode = nfssvc_decode_diropargs,
- .pc_encode = nfssvc_encode_stat,
+ .pc_encode = nfssvc_encode_statres,
.pc_argsize = sizeof(struct nfsd_diropargs),
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 6de406322106..82ba034fa579 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -84,7 +84,7 @@ DEFINE_MUTEX(nfsd_mutex);
* version 4.1 DRC caches.
* nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
*/
-spinlock_t nfsd_drc_lock;
+DEFINE_SPINLOCK(nfsd_drc_lock);
unsigned long nfsd_drc_max_mem;
unsigned long nfsd_drc_mem_used;
@@ -563,7 +563,6 @@ static void set_max_drc(void)
nfsd_drc_max_mem = (nr_free_buffer_pages()
>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
nfsd_drc_mem_used = 0;
- spin_lock_init(&nfsd_drc_lock);
dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
}
@@ -596,6 +595,37 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = {
.svo_module = THIS_MODULE,
};
+static void nfsd_complete_shutdown(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
+ nn->nfsd_serv = NULL;
+ complete(&nn->nfsd_shutdown_complete);
+}
+
+void nfsd_shutdown_threads(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
+
+ mutex_lock(&nfsd_mutex);
+ serv = nn->nfsd_serv;
+ if (serv == NULL) {
+ mutex_unlock(&nfsd_mutex);
+ return;
+ }
+
+ svc_get(serv);
+ /* Kill outstanding nfsd threads */
+ serv->sv_ops->svo_setup(serv, NULL, 0);
+ nfsd_destroy(net);
+ mutex_unlock(&nfsd_mutex);
+ /* Wait for shutdown of nfsd_serv to complete */
+ wait_for_completion(&nn->nfsd_shutdown_complete);
+}
+
bool i_am_nfsd(void)
{
return kthread_func(current) == nfsd;
@@ -618,11 +648,13 @@ int nfsd_create_serv(struct net *net)
&nfsd_thread_sv_ops);
if (nn->nfsd_serv == NULL)
return -ENOMEM;
+ init_completion(&nn->nfsd_shutdown_complete);
nn->nfsd_serv->sv_maxconn = nn->max_connections;
error = svc_bind(nn->nfsd_serv, net);
if (error < 0) {
svc_destroy(nn->nfsd_serv);
+ nfsd_complete_shutdown(net);
return error;
}
@@ -671,7 +703,7 @@ void nfsd_destroy(struct net *net)
svc_shutdown_net(nn->nfsd_serv, net);
svc_destroy(nn->nfsd_serv);
if (destroy)
- nn->nfsd_serv = NULL;
+ nfsd_complete_shutdown(net);
}
int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
@@ -997,7 +1029,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
* NFSv4 does some encoding while processing
*/
p = resv->iov_base + resv->iov_len;
- resv->iov_len += sizeof(__be32);
+ svcxdr_init_encode(rqstp);
*statp = proc->pc_func(rqstp);
if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
@@ -1052,7 +1084,7 @@ int nfssvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
*/
int nfssvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
{
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 5d79ef6a0c7f..a06c05fe3b42 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -9,12 +9,10 @@
#include "xdr.h"
#include "auth.h"
-#define NFSDDBG_FACILITY NFSDDBG_XDR
-
/*
* Mapping of S_IF* types to NFS file types
*/
-static u32 nfs_ftypes[] = {
+static const u32 nfs_ftypes[] = {
NFNON, NFCHR, NFCHR, NFBAD,
NFDIR, NFBAD, NFBLK, NFBAD,
NFREG, NFBAD, NFLNK, NFBAD,
@@ -27,6 +25,28 @@ static u32 nfs_ftypes[] = {
*/
/**
+ * svcxdr_encode_stat - Encode an NFSv2 status code
+ * @xdr: XDR stream
+ * @status: status value to encode
+ *
+ * Return values:
+ * %false: Send buffer space was exhausted
+ * %true: Success
+ */
+bool
+svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(status));
+ if (!p)
+ return false;
+ *p = status;
+
+ return true;
+}
+
+/**
* svcxdr_decode_fhandle - Decode an NFSv2 file handle
* @xdr: XDR stream positioned at an encoded NFSv2 FH
* @fhp: OUT: filled-in server file handle
@@ -50,11 +70,28 @@ svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
return true;
}
-static __be32 *
-encode_fh(__be32 *p, struct svc_fh *fhp)
+static bool
+svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS_FHSIZE);
+ if (!p)
+ return false;
memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
- return p + (NFS_FHSIZE>> 2);
+
+ return true;
+}
+
+static __be32 *
+encode_timeval(__be32 *p, const struct timespec64 *time)
+{
+ *p++ = cpu_to_be32((u32)time->tv_sec);
+ if (time->tv_nsec)
+ *p++ = cpu_to_be32(time->tv_nsec / NSEC_PER_USEC);
+ else
+ *p++ = xdr_zero;
+ return p;
}
static bool
@@ -162,68 +199,73 @@ svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
return true;
}
-static __be32 *
-encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
- struct kstat *stat)
+/**
+ * svcxdr_encode_fattr - Encode NFSv2 file attributes
+ * @rqstp: Context of a completed RPC transaction
+ * @xdr: XDR stream
+ * @fhp: File handle to encode
+ * @stat: Attributes to encode
+ *
+ * Return values:
+ * %false: Send buffer space was exhausted
+ * %true: Success
+ */
+bool
+svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp, const struct kstat *stat)
{
struct user_namespace *userns = nfsd_user_namespace(rqstp);
- struct dentry *dentry = fhp->fh_dentry;
- int type;
+ struct dentry *dentry = fhp->fh_dentry;
+ int type = stat->mode & S_IFMT;
struct timespec64 time;
- u32 f;
+ __be32 *p;
+ u32 fsid;
- type = (stat->mode & S_IFMT);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 17);
+ if (!p)
+ return false;
- *p++ = htonl(nfs_ftypes[type >> 12]);
- *p++ = htonl((u32) stat->mode);
- *p++ = htonl((u32) stat->nlink);
- *p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
- *p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
+ *p++ = cpu_to_be32(nfs_ftypes[type >> 12]);
+ *p++ = cpu_to_be32((u32)stat->mode);
+ *p++ = cpu_to_be32((u32)stat->nlink);
+ *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid));
+ *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid));
- if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
- *p++ = htonl(NFS_MAXPATHLEN);
- } else {
- *p++ = htonl((u32) stat->size);
- }
- *p++ = htonl((u32) stat->blksize);
+ if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN)
+ *p++ = cpu_to_be32(NFS_MAXPATHLEN);
+ else
+ *p++ = cpu_to_be32((u32) stat->size);
+ *p++ = cpu_to_be32((u32) stat->blksize);
if (S_ISCHR(type) || S_ISBLK(type))
- *p++ = htonl(new_encode_dev(stat->rdev));
+ *p++ = cpu_to_be32(new_encode_dev(stat->rdev));
else
- *p++ = htonl(0xffffffff);
- *p++ = htonl((u32) stat->blocks);
+ *p++ = cpu_to_be32(0xffffffff);
+ *p++ = cpu_to_be32((u32)stat->blocks);
+
switch (fsid_source(fhp)) {
- default:
- case FSIDSOURCE_DEV:
- *p++ = htonl(new_encode_dev(stat->dev));
- break;
case FSIDSOURCE_FSID:
- *p++ = htonl((u32) fhp->fh_export->ex_fsid);
+ fsid = (u32)fhp->fh_export->ex_fsid;
break;
case FSIDSOURCE_UUID:
- f = ((u32*)fhp->fh_export->ex_uuid)[0];
- f ^= ((u32*)fhp->fh_export->ex_uuid)[1];
- f ^= ((u32*)fhp->fh_export->ex_uuid)[2];
- f ^= ((u32*)fhp->fh_export->ex_uuid)[3];
- *p++ = htonl(f);
+ fsid = ((u32 *)fhp->fh_export->ex_uuid)[0];
+ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[1];
+ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[2];
+ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[3];
+ break;
+ default:
+ fsid = new_encode_dev(stat->dev);
break;
}
- *p++ = htonl((u32) stat->ino);
- *p++ = htonl((u32) stat->atime.tv_sec);
- *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
- time = stat->mtime;
- lease_get_mtime(d_inode(dentry), &time);
- *p++ = htonl((u32) time.tv_sec);
- *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
- *p++ = htonl((u32) stat->ctime.tv_sec);
- *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
+ *p++ = cpu_to_be32(fsid);
- return p;
-}
+ *p++ = cpu_to_be32((u32)stat->ino);
+ p = encode_timeval(p, &stat->atime);
+ time = stat->mtime;
+ lease_get_mtime(d_inode(dentry), &time);
+ p = encode_timeval(p, &time);
+ encode_timeval(p, &stat->ctime);
-/* Helper function for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
-{
- return encode_fattr(rqstp, p, fhp, stat);
+ return true;
}
/*
@@ -390,106 +432,118 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
*/
int
-nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p)
+nfssvc_encode_statres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_stat *resp = rqstp->rq_resp;
- *p++ = resp->status;
- return xdr_ressize_check(rqstp, p);
+ return svcxdr_encode_stat(xdr, resp->status);
}
int
-nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
+nfssvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- goto out;
- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
-out:
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ break;
+ }
+
+ return 1;
}
int
nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_diropres *resp = rqstp->rq_resp;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- goto out;
- p = encode_fh(p, &resp->fh);
- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
-out:
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_fhandle(xdr, &resp->fh))
+ return 0;
+ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ break;
+ }
+
+ return 1;
}
int
nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- return xdr_ressize_check(rqstp, p);
-
- *p++ = htonl(resp->len);
- xdr_ressize_check(rqstp, p);
- rqstp->rq_res.page_len = resp->len;
- if (resp->len & 3) {
- /* need to pad the tail */
- rqstp->rq_res.tail[0].iov_base = p;
- *p = 0;
- rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
- }
- if (svc_encode_result_payload(rqstp, head->iov_len, resp->len))
+ if (!svcxdr_encode_stat(xdr, resp->status))
return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (xdr_stream_encode_u32(xdr, resp->len) < 0)
+ return 0;
+ xdr_write_pages(xdr, &resp->page, 0, resp->len);
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
+ return 0;
+ break;
+ }
+
return 1;
}
int
nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readres *resp = rqstp->rq_resp;
struct kvec *head = rqstp->rq_res.head;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- return xdr_ressize_check(rqstp, p);
-
- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
- *p++ = htonl(resp->count);
- xdr_ressize_check(rqstp, p);
-
- /* now update rqstp->rq_res to reflect data as well */
- rqstp->rq_res.page_len = resp->count;
- if (resp->count & 3) {
- /* need to pad the tail */
- rqstp->rq_res.tail[0].iov_base = p;
- *p = 0;
- rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
- }
- if (svc_encode_result_payload(rqstp, head->iov_len, resp->count))
+ if (!svcxdr_encode_stat(xdr, resp->status))
return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
+ return 0;
+ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
+ return 0;
+ xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
+ resp->count);
+ if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
+ return 0;
+ break;
+ }
+
return 1;
}
int
nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_readdirres *resp = rqstp->rq_resp;
+ struct xdr_buf *dirlist = &resp->dirlist;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- return xdr_ressize_check(rqstp, p);
-
- xdr_ressize_check(rqstp, p);
- p = resp->buffer;
- *p++ = 0; /* no more entries */
- *p++ = htonl((resp->common.err == nfserr_eof));
- rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1;
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
+ /* no more entries */
+ if (xdr_stream_encode_item_absent(xdr) < 0)
+ return 0;
+ if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
+ return 0;
+ break;
+ }
return 1;
}
@@ -497,64 +551,113 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
int
nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_res_stream;
struct nfsd_statfsres *resp = rqstp->rq_resp;
struct kstatfs *stat = &resp->stats;
- *p++ = resp->status;
- if (resp->status != nfs_ok)
- return xdr_ressize_check(rqstp, p);
+ if (!svcxdr_encode_stat(xdr, resp->status))
+ return 0;
+ switch (resp->status) {
+ case nfs_ok:
+ p = xdr_reserve_space(xdr, XDR_UNIT * 5);
+ if (!p)
+ return 0;
+ *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
+ *p++ = cpu_to_be32(stat->f_bsize);
+ *p++ = cpu_to_be32(stat->f_blocks);
+ *p++ = cpu_to_be32(stat->f_bfree);
+ *p = cpu_to_be32(stat->f_bavail);
+ break;
+ }
- *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */
- *p++ = htonl(stat->f_bsize);
- *p++ = htonl(stat->f_blocks);
- *p++ = htonl(stat->f_bfree);
- *p++ = htonl(stat->f_bavail);
- return xdr_ressize_check(rqstp, p);
+ return 1;
}
-int
-nfssvc_encode_entry(void *ccdv, const char *name,
- int namlen, loff_t offset, u64 ino, unsigned int d_type)
+/**
+ * nfssvc_encode_nfscookie - Encode a directory offset cookie
+ * @resp: readdir result context
+ * @offset: offset cookie to encode
+ *
+ * The buffer space for the offset cookie has already been reserved
+ * by svcxdr_encode_entry_common().
+ */
+void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset)
{
- struct readdir_cd *ccd = ccdv;
- struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
- __be32 *p = cd->buffer;
- int buflen, slen;
+ __be32 cookie = cpu_to_be32(offset);
- /*
- dprintk("nfsd: entry(%.*s off %ld ino %ld)\n",
- namlen, name, offset, ino);
- */
+ if (!resp->cookie_offset)
+ return;
- if (offset > ~((u32) 0)) {
- cd->common.err = nfserr_fbig;
- return -EINVAL;
- }
- if (cd->offset)
- *cd->offset = htonl(offset);
+ write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie,
+ sizeof(cookie));
+ resp->cookie_offset = 0;
+}
- /* truncate filename */
- namlen = min(namlen, NFS2_MAXNAMLEN);
- slen = XDR_QUADLEN(namlen);
+static bool
+svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name,
+ int namlen, loff_t offset, u64 ino)
+{
+ struct xdr_buf *dirlist = &resp->dirlist;
+ struct xdr_stream *xdr = &resp->xdr;
- if ((buflen = cd->buflen - slen - 4) < 0) {
- cd->common.err = nfserr_toosmall;
- return -EINVAL;
- }
- if (ino > ~((u32) 0)) {
- cd->common.err = nfserr_fbig;
- return -EINVAL;
- }
- *p++ = xdr_one; /* mark entry present */
- *p++ = htonl((u32) ino); /* file id */
- p = xdr_encode_array(p, name, namlen);/* name length & name */
- cd->offset = p; /* remember pointer */
- *p++ = htonl(~0U); /* offset of next entry */
-
- cd->buflen = buflen;
- cd->buffer = p;
- cd->common.err = nfs_ok;
+ if (xdr_stream_encode_item_present(xdr) < 0)
+ return false;
+ /* fileid */
+ if (xdr_stream_encode_u32(xdr, (u32)ino) < 0)
+ return false;
+ /* name */
+ if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS2_MAXNAMLEN)) < 0)
+ return false;
+ /* cookie */
+ resp->cookie_offset = dirlist->len;
+ if (xdr_stream_encode_u32(xdr, ~0U) < 0)
+ return false;
+
+ return true;
+}
+
+/**
+ * nfssvc_encode_entry - encode one NFSv2 READDIR entry
+ * @data: directory context
+ * @name: name of the object to be encoded
+ * @namlen: length of that name, in bytes
+ * @offset: the offset of the previous entry
+ * @ino: the fileid of this entry
+ * @d_type: unused
+ *
+ * Return values:
+ * %0: Entry was successfully encoded.
+ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
+ *
+ * On exit, the following fields are updated:
+ * - resp->xdr
+ * - resp->common.err
+ * - resp->cookie_offset
+ */
+int nfssvc_encode_entry(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct readdir_cd *ccd = data;
+ struct nfsd_readdirres *resp = container_of(ccd,
+ struct nfsd_readdirres,
+ common);
+ unsigned int starting_length = resp->dirlist.len;
+
+ /* The offset cookie for the previous entry */
+ nfssvc_encode_nfscookie(resp, offset);
+
+ if (!svcxdr_encode_entry_common(resp, name, namlen, offset, ino))
+ goto out_toosmall;
+
+ xdr_commit_encode(&resp->xdr);
+ resp->common.err = nfs_ok;
return 0;
+
+out_toosmall:
+ resp->cookie_offset = 0;
+ resp->common.err = nfserr_toosmall;
+ resp->dirlist.len = starting_length;
+ return -EINVAL;
}
/*
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 73deea353169..54cab651ac1d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -371,6 +371,10 @@ struct nfs4_client {
/* debugging info directory under nfsd/clients/ : */
struct dentry *cl_nfsd_dentry;
+ /* 'info' file within that directory. Ref is not counted,
+ * but will remain valid iff cl_nfsd_dentry != NULL
+ */
+ struct dentry *cl_nfsd_info_dentry;
/* for nfs41 callbacks */
/* We currently support a single back channel with a single slot */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 92a0973dd671..27a93ebd1d80 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -391,6 +391,30 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name, \
DEFINE_NFSD_ERR_EVENT(read_err);
DEFINE_NFSD_ERR_EVENT(write_err);
+TRACE_EVENT(nfsd_dirent,
+ TP_PROTO(struct svc_fh *fhp,
+ u64 ino,
+ const char *name,
+ int namlen),
+ TP_ARGS(fhp, ino, name, namlen),
+ TP_STRUCT__entry(
+ __field(u32, fh_hash)
+ __field(u64, ino)
+ __field(int, len)
+ __dynamic_array(unsigned char, name, namlen)
+ ),
+ TP_fast_assign(
+ __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
+ __entry->ino = ino;
+ __entry->len = namlen;
+ memcpy(__get_str(name), name, namlen);
+ __assign_str(name, name);
+ ),
+ TP_printk("fh_hash=0x%08x ino=%llu name=%.*s",
+ __entry->fh_hash, __entry->ino,
+ __entry->len, __get_str(name))
+)
+
#include "state.h"
#include "filecache.h"
#include "vfs.h"
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fd6be35a1642..15adf1f6ab21 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1968,8 +1968,9 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
return 0;
}
-static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
- struct readdir_cd *cdp, loff_t *offsetp)
+static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
+ nfsd_filldir_t func, struct readdir_cd *cdp,
+ loff_t *offsetp)
{
struct buffered_dirent *de;
int host_err;
@@ -2015,6 +2016,8 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (cdp->err != nfs_ok)
break;
+ trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen);
+
reclen = ALIGN(sizeof(*de) + de->namlen,
sizeof(u64));
size -= reclen;
@@ -2062,7 +2065,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
goto out_close;
}
- err = nfsd_buffered_readdir(file, func, cdp, offsetp);
+ err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp);
if (err == nfserr_eof || err == nfserr_toosmall)
err = nfs_ok; /* can still be found in ->err */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a2442ebe5acf..b21b76e6b9a8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -152,7 +152,7 @@ static inline void fh_drop_write(struct svc_fh *fh)
}
}
-static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
{
struct path p = {.mnt = fh->fh_export->ex_path.mnt,
.dentry = fh->fh_dentry};
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 3018b52b6d5e..f45b4bc93f52 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -94,6 +94,7 @@ struct nfsd_diropres {
struct nfsd_readlinkres {
__be32 status;
int len;
+ struct page *page;
};
struct nfsd_readres {
@@ -101,17 +102,20 @@ struct nfsd_readres {
struct svc_fh fh;
unsigned long count;
struct kstat stat;
+ struct page **pages;
};
struct nfsd_readdirres {
+ /* Components of the reply */
__be32 status;
int count;
+ /* Used to encode the reply's entry list */
+ struct xdr_stream xdr;
+ struct xdr_buf dirlist;
struct readdir_cd common;
- __be32 * buffer;
- int buflen;
- __be32 * offset;
+ unsigned int cookie_offset;
};
struct nfsd_statfsres {
@@ -147,23 +151,26 @@ int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
-int nfssvc_encode_stat(struct svc_rqst *, __be32 *);
-int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *);
+int nfssvc_encode_statres(struct svc_rqst *, __be32 *);
+int nfssvc_encode_attrstatres(struct svc_rqst *, __be32 *);
int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
int nfssvc_encode_readres(struct svc_rqst *, __be32 *);
int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *);
int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
-int nfssvc_encode_entry(void *, const char *name,
- int namlen, loff_t offset, u64 ino, unsigned int);
+void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset);
+int nfssvc_encode_entry(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type);
void nfssvc_release_attrstat(struct svc_rqst *rqstp);
void nfssvc_release_diropres(struct svc_rqst *rqstp);
void nfssvc_release_readres(struct svc_rqst *rqstp);
/* Helper functions for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp);
+bool svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status);
+bool svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp, const struct kstat *stat);
#endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 3e1578953f54..933008382bbe 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -137,6 +137,7 @@ struct nfsd3_readlinkres {
__be32 status;
struct svc_fh fh;
__u32 len;
+ struct page **pages;
};
struct nfsd3_readres {
@@ -144,6 +145,7 @@ struct nfsd3_readres {
struct svc_fh fh;
unsigned long count;
__u32 eof;
+ struct page **pages;
};
struct nfsd3_writeres {
@@ -167,19 +169,17 @@ struct nfsd3_linkres {
};
struct nfsd3_readdirres {
+ /* Components of the reply */
__be32 status;
struct svc_fh fh;
- /* Just to save kmalloc on every readdirplus entry (svc_fh is a
- * little large for the stack): */
- struct svc_fh scratch;
- int count;
__be32 verf[2];
+ /* Used to encode the reply's entry list */
+ struct xdr_stream xdr;
+ struct xdr_buf dirlist;
+ struct svc_fh scratch;
struct readdir_cd common;
- __be32 * buffer;
- int buflen;
- __be32 * offset;
- __be32 * offset1;
+ unsigned int cookie_offset;
struct svc_rqst * rqstp;
};
@@ -280,9 +280,9 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_getattrres(struct svc_rqst *, __be32 *);
int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
-int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *);
+int nfs3svc_encode_lookupres(struct svc_rqst *, __be32 *);
int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *);
int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *);
int nfs3svc_encode_readres(struct svc_rqst *, __be32 *);
@@ -298,15 +298,16 @@ int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *);
void nfs3svc_release_fhandle(struct svc_rqst *);
void nfs3svc_release_fhandle2(struct svc_rqst *);
-int nfs3svc_encode_entry(void *, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int);
-int nfs3svc_encode_entry_plus(void *, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int);
+
+void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset);
+int nfs3svc_encode_entry3(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type);
+int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type);
/* Helper functions for NFSv3 ACL code */
-__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
- struct svc_fh *fhp);
bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp);
+bool svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status);
+bool svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ const struct svc_fh *fhp);
#endif /* _LINUX_NFSD_XDR3_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index c300885ae75d..fe540a3415c6 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -698,7 +698,7 @@ struct nfsd4_compoundargs {
struct nfsd4_compoundres {
/* scratch variables for XDR encode */
- struct xdr_stream xdr;
+ struct xdr_stream *xdr;
struct svc_rqst * rqstp;
u32 taglen;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index e1bd592ce700..7cf765258fda 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -148,6 +148,8 @@ const struct inode_operations nilfs_file_inode_operations = {
.setattr = nilfs_setattr,
.permission = nilfs_permission,
.fiemap = nilfs_fiemap,
+ .fileattr_get = nilfs_fileattr_get,
+ .fileattr_set = nilfs_fileattr_set,
};
/* end of file */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b053b40315bf..3fcb9357bbfd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -16,6 +16,7 @@
#include <linux/compat.h> /* compat_ptr() */
#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
+#include <linux/fileattr.h>
#include "nilfs.h"
#include "segment.h"
#include "bmap.h"
@@ -113,51 +114,39 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
}
/**
- * nilfs_ioctl_getflags - ioctl to support lsattr
+ * nilfs_fileattr_get - ioctl to support lsattr
*/
-static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
+int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
- unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
+ struct inode *inode = d_inode(dentry);
- return put_user(flags, (int __user *)argp);
+ fileattr_fill_flags(fa, NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE);
+
+ return 0;
}
/**
- * nilfs_ioctl_setflags - ioctl to support chattr
+ * nilfs_fileattr_set - ioctl to support chattr
*/
-static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
- void __user *argp)
+int nilfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
struct nilfs_transaction_info ti;
unsigned int flags, oldflags;
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *)argp))
- return -EFAULT;
-
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
- flags = nilfs_mask_flags(inode->i_mode, flags);
-
- inode_lock(inode);
-
- oldflags = NILFS_I(inode)->i_flags;
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- ret = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (ret)
- goto out;
+ flags = nilfs_mask_flags(inode->i_mode, fa->flags);
ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
if (ret)
- goto out;
+ return ret;
- NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
- (flags & FS_FL_USER_MODIFIABLE);
+ oldflags = NILFS_I(inode)->i_flags & ~FS_FL_USER_MODIFIABLE;
+ NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE);
nilfs_set_inode_flags(inode);
inode->i_ctime = current_time(inode);
@@ -165,11 +154,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_mark_inode_dirty(inode);
- ret = nilfs_transaction_commit(inode->i_sb);
-out:
- inode_unlock(inode);
- mnt_drop_write_file(filp);
- return ret;
+ return nilfs_transaction_commit(inode->i_sb);
}
/**
@@ -1282,10 +1267,6 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- return nilfs_ioctl_getflags(inode, argp);
- case FS_IOC_SETFLAGS:
- return nilfs_ioctl_setflags(inode, filp, argp);
case FS_IOC_GETVERSION:
return nilfs_ioctl_getversion(inode, argp);
case NILFS_IOCTL_CHANGE_CPMODE:
@@ -1331,12 +1312,6 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index ecace5f96a95..189bd1007a2f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -552,6 +552,8 @@ const struct inode_operations nilfs_dir_inode_operations = {
.setattr = nilfs_setattr,
.permission = nilfs_permission,
.fiemap = nilfs_fiemap,
+ .fileattr_get = nilfs_fileattr_get,
+ .fileattr_set = nilfs_fileattr_set,
};
const struct inode_operations nilfs_special_inode_operations = {
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index c4a45a081ade..60b21b6eeac0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -243,6 +243,9 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
/* ioctl.c */
+int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m);
+int nilfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3bfb4147895a..ad20403b383f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2295,7 +2295,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle = NULL;
loff_t end = offset + bytes;
- int ret = 0, credits = 0, locked = 0;
+ int ret = 0, credits = 0;
ocfs2_init_dealloc_ctxt(&dealloc);
@@ -2306,13 +2306,6 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
!dwc->dw_orphaned)
goto out;
- /* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
- * are in that context. */
- if (dwc->dw_writer_pid != task_pid_nr(current)) {
- inode_lock(inode);
- locked = 1;
- }
-
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2393,8 +2386,6 @@ out:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
ocfs2_run_deallocs(osb, &dealloc);
- if (locked)
- inode_unlock(inode);
ocfs2_dio_free_write_ctx(inode, dwc);
return ret;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8e3a369086db..0fbe8bf7190f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2204,7 +2204,7 @@ static void ocfs2_unpack_timespec(struct timespec64 *spec,
spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}
-static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
+static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
@@ -2213,6 +2213,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
mlog_meta_lvb(0, lockres);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode)))
+ return -ESTALE;
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -2240,6 +2242,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
ocfs2_unpack_timespec(&inode->i_ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
spin_unlock(&oi->ip_lock);
+ return 0;
}
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2342,7 +2345,8 @@ static int ocfs2_inode_lock_update(struct inode *inode,
if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
mlog(0, "Trusting LVB on inode %llu\n",
(unsigned long long)oi->ip_blkno);
- ocfs2_refresh_inode_from_lvb(inode);
+ status = ocfs2_refresh_inode_from_lvb(inode);
+ goto bail_refresh;
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
@@ -2352,6 +2356,10 @@ static int ocfs2_inode_lock_update(struct inode *inode,
goto bail_refresh;
}
fe = (struct ocfs2_dinode *) (*bh)->b_data;
+ if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) {
+ status = -ESTALE;
+ goto bail_refresh;
+ }
/* This is a good chance to make sure we're not
* locking an invalid object. ocfs2_read_inode_block()
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6611c64ca0be..db8a6265b749 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1245,22 +1245,24 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto bail_unlock;
}
}
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
2 * ocfs2_quota_trans_credits(sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
status = __dquot_transfer(inode, transfer_to);
if (status < 0)
goto bail_commit;
} else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
- goto bail_unlock;
+ goto bail_unlock_alloc;
}
}
@@ -1273,6 +1275,8 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
bail_commit:
ocfs2_commit_trans(osb, handle);
+bail_unlock_alloc:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail_unlock:
if (status && inode_locked) {
ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
@@ -2645,6 +2649,8 @@ const struct inode_operations ocfs2_file_iops = {
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
+ .fileattr_get = ocfs2_fileattr_get,
+ .fileattr_set = ocfs2_fileattr_set,
};
const struct inode_operations ocfs2_special_file_iops = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 50c9b30ee9f6..f59461d85da4 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
+#include <linux/fileattr.h>
#include <cluster/masklog.h>
@@ -61,8 +62,10 @@ static inline int o2info_coherent(struct ocfs2_info_request *req)
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
-static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
+int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
+ unsigned int flags;
int status;
status = ocfs2_inode_lock(inode, NULL, 0);
@@ -71,15 +74,19 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
return status;
}
ocfs2_get_inode_flags(OCFS2_I(inode));
- *flags = OCFS2_I(inode)->ip_attr;
+ flags = OCFS2_I(inode)->ip_attr;
ocfs2_inode_unlock(inode, 0);
+ fileattr_fill_flags(fa, flags & OCFS2_FL_VISIBLE);
+
return status;
}
-static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
- unsigned mask)
+int ocfs2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
+ struct inode *inode = d_inode(dentry);
+ unsigned int flags = fa->flags;
struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -87,7 +94,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
unsigned oldflags;
int status;
- inode_lock(inode);
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
@@ -95,19 +103,17 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail;
}
- status = -EACCES;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- goto bail_unlock;
-
if (!S_ISDIR(inode->i_mode))
flags &= ~OCFS2_DIRSYNC_FL;
oldflags = ocfs2_inode->ip_attr;
- flags = flags & mask;
- flags |= oldflags & ~mask;
+ flags = flags & OCFS2_FL_MODIFIABLE;
+ flags |= oldflags & ~OCFS2_FL_MODIFIABLE;
- status = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (status)
+ /* Check already done by VFS, but repeat with ocfs lock */
+ status = -EPERM;
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+ !capable(CAP_LINUX_IMMUTABLE))
goto bail_unlock;
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
@@ -129,8 +135,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
- inode_unlock(inode);
-
brelse(bh);
return status;
@@ -836,7 +840,6 @@ bail:
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- unsigned int flags;
int new_clusters;
int status;
struct ocfs2_space_resv sr;
@@ -849,24 +852,6 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
switch (cmd) {
- case OCFS2_IOC_GETFLAGS:
- status = ocfs2_get_inode_attr(inode, &flags);
- if (status < 0)
- return status;
-
- flags &= OCFS2_FL_VISIBLE;
- return put_user(flags, (int __user *) arg);
- case OCFS2_IOC_SETFLAGS:
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
-
- status = mnt_want_write_file(filp);
- if (status)
- return status;
- status = ocfs2_set_inode_attr(inode, flags,
- OCFS2_FL_MODIFIABLE);
- mnt_drop_write_file(filp);
- return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
@@ -959,12 +944,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
switch (cmd) {
- case OCFS2_IOC32_GETFLAGS:
- cmd = OCFS2_IOC_GETFLAGS;
- break;
- case OCFS2_IOC32_SETFLAGS:
- cmd = OCFS2_IOC_SETFLAGS;
- break;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 9f5e4d95e37f..0297c8846945 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -11,6 +11,9 @@
#ifndef OCFS2_IOCTL_PROTO_H
#define OCFS2_IOCTL_PROTO_H
+int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ocfs2_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3abdd36da2e2..05ced86580d1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -50,6 +50,7 @@
#include "xattr.h"
#include "acl.h"
#include "ocfs2_trace.h"
+#include "ioctl.h"
#include "buffer_head_io.h"
@@ -2918,4 +2919,6 @@ const struct inode_operations ocfs2_dir_iops = {
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
+ .fileattr_get = ocfs2_fileattr_get,
+ .fileattr_set = ocfs2_fileattr_set,
};
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index d7b31734f6be..273616bd4f19 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -13,14 +13,6 @@
#define OCFS2_IOCTL_H
/*
- * ioctl commands
- */
-#define OCFS2_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define OCFS2_IOC_SETFLAGS FS_IOC_SETFLAGS
-#define OCFS2_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define OCFS2_IOC32_SETFLAGS FS_IOC32_SETFLAGS
-
-/*
* Space reservation / allocation / free ioctls and argument structure
* are designed to be compatible with XFS.
*
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 40c8c2e32fa3..f825176ff4ed 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -236,27 +236,31 @@ found:
mutex_unlock(&op_mutex);
if (IS_ERR(inode))
return ERR_CAST(inode);
- ent_oi = OP_I(inode);
- ent_oi->type = ent_type;
- ent_oi->u = ent_data;
-
- switch (ent_type) {
- case op_inode_node:
- inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- inode->i_op = &openprom_inode_operations;
- inode->i_fop = &openprom_operations;
- set_nlink(inode, 2);
- break;
- case op_inode_prop:
- if (of_node_name_eq(dp, "options") && (len == 17) &&
- !strncmp (name, "security-password", 17))
- inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
- else
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &openpromfs_prop_ops;
- set_nlink(inode, 1);
- inode->i_size = ent_oi->u.prop->length;
- break;
+ if (inode->i_state & I_NEW) {
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ ent_oi = OP_I(inode);
+ ent_oi->type = ent_type;
+ ent_oi->u = ent_data;
+
+ switch (ent_type) {
+ case op_inode_node:
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ inode->i_op = &openprom_inode_operations;
+ inode->i_fop = &openprom_operations;
+ set_nlink(inode, 2);
+ break;
+ case op_inode_prop:
+ if (of_node_name_eq(dp, "options") && (len == 17) &&
+ !strncmp (name, "security-password", 17))
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+ else
+ inode->i_mode = S_IFREG | S_IRUGO;
+ inode->i_fop = &openpromfs_prop_ops;
+ set_nlink(inode, 1);
+ inode->i_size = ent_oi->u.prop->length;
+ break;
+ }
+ unlock_new_inode(inode);
}
return d_splice_alias(inode, dentry);
@@ -345,20 +349,9 @@ static void openprom_free_inode(struct inode *inode)
static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
{
- struct inode *inode;
-
- inode = iget_locked(sb, ino);
+ struct inode *inode = iget_locked(sb, ino);
if (!inode)
- return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- if (inode->i_ino == OPENPROM_ROOT_INO) {
- inode->i_op = &openprom_inode_operations;
- inode->i_fop = &openprom_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- }
- unlock_new_inode(inode);
- }
+ inode = ERR_PTR(-ENOMEM);
return inode;
}
@@ -394,9 +387,15 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
+ root_inode->i_mtime = root_inode->i_atime =
+ root_inode->i_ctime = current_time(root_inode);
+ root_inode->i_op = &openprom_inode_operations;
+ root_inode->i_fop = &openprom_operations;
+ root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
oi = OP_I(root_inode);
oi->type = op_inode_node;
oi->u.node = of_find_node_by_path("/");
+ unlock_new_inode(root_inode);
s->s_root = d_make_root(root_inode);
if (!s->s_root)
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 9b28a7132466..ccef8c9dd516 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -375,84 +375,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
return ret;
}
-static int orangefs_getflags(struct inode *inode, unsigned long *uval)
-{
- __u64 val = 0;
- int ret;
-
- ret = orangefs_inode_getxattr(inode,
- "user.pvfs2.meta_hint",
- &val, sizeof(val));
- if (ret < 0 && ret != -ENODATA)
- return ret;
- else if (ret == -ENODATA)
- val = 0;
- *uval = val;
- return 0;
-}
-
-/*
- * Perform a miscellaneous operation on a file.
- */
-static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- struct inode *inode = file_inode(file);
- int ret = -ENOTTY;
- __u64 val = 0;
- unsigned long uval;
-
- gossip_debug(GOSSIP_FILE_DEBUG,
- "orangefs_ioctl: called with cmd %d\n",
- cmd);
-
- /*
- * we understand some general ioctls on files, such as the immutable
- * and append flags
- */
- if (cmd == FS_IOC_GETFLAGS) {
- ret = orangefs_getflags(inode, &uval);
- if (ret)
- return ret;
- gossip_debug(GOSSIP_FILE_DEBUG,
- "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
- (unsigned long long)uval);
- return put_user(uval, (int __user *)arg);
- } else if (cmd == FS_IOC_SETFLAGS) {
- unsigned long old_uval;
-
- ret = 0;
- if (get_user(uval, (int __user *)arg))
- return -EFAULT;
- /*
- * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
- * is turned on for a file. The user is not allowed to turn
- * on this bit, but the bit is present if the user first gets
- * the flags and then updates the flags with some new
- * settings. So, we ignore it in the following edit. bligon.
- */
- if ((uval & ~ORANGEFS_MIRROR_FL) &
- (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
- gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
- return -EINVAL;
- }
- ret = orangefs_getflags(inode, &old_uval);
- if (ret)
- return ret;
- ret = vfs_ioc_setflags_prepare(inode, old_uval, uval);
- if (ret)
- return ret;
- val = uval;
- gossip_debug(GOSSIP_FILE_DEBUG,
- "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
- (unsigned long long)val);
- ret = orangefs_inode_setxattr(inode,
- "user.pvfs2.meta_hint",
- &val, sizeof(val), 0);
- }
-
- return ret;
-}
-
static vm_fault_t orangefs_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
@@ -657,7 +579,6 @@ const struct file_operations orangefs_file_operations = {
.read_iter = orangefs_file_read_iter,
.write_iter = orangefs_file_write_iter,
.lock = orangefs_lock,
- .unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
.open = generic_file_open,
.splice_read = generic_file_splice_read,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5079cfafa8d7..85b3dd2d769d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -11,6 +11,7 @@
*/
#include <linux/bvec.h>
+#include <linux/fileattr.h>
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
@@ -954,6 +955,53 @@ int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags
return __orangefs_setattr(inode, &iattr);
}
+static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ u64 val = 0;
+ int ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
+ dentry);
+
+ ret = orangefs_inode_getxattr(d_inode(dentry),
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val));
+ if (ret < 0 && ret != -ENODATA)
+ return ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);
+
+ fileattr_fill_flags(fa, val);
+ return 0;
+}
+
+static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ u64 val = 0;
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "%s: called on %pd\n", __func__,
+ dentry);
+ /*
+ * ORANGEFS_MIRROR_FL is set internally when the mirroring mode is
+ * turned on for a file. The user is not allowed to turn on this bit,
+ * but the bit is present if the user first gets the flags and then
+ * updates the flags with some new settings. So, we ignore it in the
+ * following edit. bligon.
+ */
+ if (fileattr_has_fsx(fa) ||
+ (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | ORANGEFS_MIRROR_FL))) {
+ gossip_err("%s: only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n",
+ __func__);
+ return -EOPNOTSUPP;
+ }
+ val = fa->flags;
+ gossip_debug(GOSSIP_FILE_DEBUG, "%s: flags=%u\n", __func__, (u32) val);
+ return orangefs_inode_setxattr(d_inode(dentry),
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val), 0);
+}
+
/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
@@ -963,6 +1011,8 @@ static const struct inode_operations orangefs_file_inode_operations = {
.listxattr = orangefs_listxattr,
.permission = orangefs_permission,
.update_time = orangefs_update_time,
+ .fileattr_get = orangefs_fileattr_get,
+ .fileattr_set = orangefs_fileattr_set,
};
static int orangefs_init_iops(struct inode *inode)
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index d4b7ae763186..46b7dcff18ac 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -221,7 +221,7 @@ static int orangefs_inode_is_stale(struct inode *inode,
* If the inode type or symlink target have changed then this
* inode is stale.
*/
- if (type == -1 || !(inode->i_mode & type)) {
+ if (type == -1 || inode_wrong_type(inode, type)) {
orangefs_make_bad_inode(inode);
return 1;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 836f14b9d3a6..93efe7048a77 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1301,4 +1301,6 @@ const struct inode_operations ovl_dir_inode_operations = {
.listxattr = ovl_listxattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
+ .fileattr_get = ovl_fileattr_get,
+ .fileattr_set = ovl_fileattr_set,
};
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index dbfb35fb0ff7..c144183a7e09 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -430,20 +430,11 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
if (WARN_ON(file != vma->vm_file))
return -EIO;
- vma->vm_file = get_file(realfile);
+ vma_set_file(vma, realfile);
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = call_mmap(vma->vm_file, vma);
revert_creds(old_cred);
-
- if (ret) {
- /* Drop reference count from new vm_file value */
- fput(realfile);
- } else {
- /* Drop reference count from previous vm_file value */
- fput(file);
- }
-
ovl_file_accessed(file);
return ret;
@@ -491,112 +482,6 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
return ret;
}
-static long ovl_real_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- struct fd real;
- long ret;
-
- ret = ovl_real_fdget(file, &real);
- if (ret)
- return ret;
-
- ret = security_file_ioctl(real.file, cmd, arg);
- if (!ret) {
- /*
- * Don't override creds, since we currently can't safely check
- * permissions before doing so.
- */
- ret = vfs_ioctl(real.file, cmd, arg);
- }
-
- fdput(real);
-
- return ret;
-}
-
-static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
- long ret;
- struct inode *inode = file_inode(file);
-
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- /*
- * Prevent copy up if immutable and has no CAP_LINUX_IMMUTABLE
- * capability.
- */
- ret = -EPERM;
- if (!ovl_has_upperdata(inode) && IS_IMMUTABLE(inode) &&
- !capable(CAP_LINUX_IMMUTABLE))
- goto unlock;
-
- ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
- if (ret)
- goto unlock;
-
- ret = ovl_real_ioctl(file, cmd, arg);
-
- ovl_copyflags(ovl_inode_real(inode), inode);
-unlock:
- inode_unlock(inode);
-
- mnt_drop_write_file(file);
-
- return ret;
-
-}
-
-long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- long ret;
-
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- case FS_IOC_FSGETXATTR:
- ret = ovl_real_ioctl(file, cmd, arg);
- break;
-
- case FS_IOC_FSSETXATTR:
- case FS_IOC_SETFLAGS:
- ret = ovl_ioctl_set_flags(file, cmd, arg);
- break;
-
- default:
- ret = -ENOTTY;
- }
-
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
-
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
-
- default:
- return -ENOIOCTLCMD;
- }
-
- return ovl_ioctl(file, cmd, arg);
-}
-#endif
-
enum ovl_copyop {
OVL_COPY,
OVL_CLONE,
@@ -696,10 +581,6 @@ const struct file_operations ovl_file_operations = {
.mmap = ovl_mmap,
.fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
- .unlocked_ioctl = ovl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ovl_compat_ioctl,
-#endif
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 003cf83bf78a..c3c96b4b3b33 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -11,6 +11,8 @@
#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
+#include <linux/fileattr.h>
+#include <linux/security.h>
#include "overlayfs.h"
@@ -500,6 +502,79 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return err;
}
+/*
+ * Work around the fact that security_file_ioctl() takes a file argument.
+ * Introducing security_inode_fileattr_get/set() hooks would solve this issue
+ * properly.
+ */
+static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa,
+ bool set)
+{
+ struct path realpath;
+ struct file *file;
+ unsigned int cmd;
+ int err;
+
+ ovl_path_real(dentry, &realpath);
+ file = dentry_open(&realpath, O_RDONLY, current_cred());
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (set)
+ cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS;
+ else
+ cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS;
+
+ err = security_file_ioctl(file, cmd, 0);
+ fput(file);
+
+ return err;
+}
+
+int ovl_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *upperdentry;
+ const struct cred *old_cred;
+ int err;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry);
+ if (!err) {
+ upperdentry = ovl_dentry_upper(dentry);
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ err = ovl_security_fileattr(dentry, fa, true);
+ if (!err)
+ err = vfs_fileattr_set(&init_user_ns, upperdentry, fa);
+ revert_creds(old_cred);
+ ovl_copyflags(ovl_inode_real(inode), inode);
+ }
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct dentry *realdentry = ovl_dentry_real(dentry);
+ const struct cred *old_cred;
+ int err;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ err = ovl_security_fileattr(dentry, fa, false);
+ if (!err)
+ err = vfs_fileattr_get(realdentry, fa);
+ revert_creds(old_cred);
+
+ return err;
+}
+
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
@@ -508,6 +583,8 @@ static const struct inode_operations ovl_file_inode_operations = {
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
.fiemap = ovl_fiemap,
+ .fileattr_get = ovl_fileattr_get,
+ .fileattr_set = ovl_fileattr_set,
};
static const struct inode_operations ovl_symlink_inode_operations = {
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 3fe05fb5d145..1d573972ce22 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -371,7 +371,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
return PTR_ERR(origin);
if (upperdentry && !ovl_is_whiteout(upperdentry) &&
- ((d_inode(origin)->i_mode ^ d_inode(upperdentry)->i_mode) & S_IFMT))
+ inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode))
goto invalid;
if (!*stackp)
@@ -730,7 +730,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
index = ERR_PTR(-ESTALE);
goto out;
} else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) ||
- ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) {
+ inode_wrong_type(inode, d_inode(origin)->i_mode)) {
/*
* Index should always be of the same file type as origin
* except for the case of a whiteout index. A whiteout
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 95cff83786a5..f38cb5e07eff 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -519,8 +519,9 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
extern const struct file_operations ovl_file_operations;
int __init ovl_aio_request_cache_init(void);
void ovl_aio_request_cache_destroy(void);
-long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-long ovl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ovl_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index f404a78e6b60..1ddad0967255 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -963,10 +963,6 @@ const struct file_operations ovl_dir_operations = {
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
- .unlocked_ioctl = ovl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ovl_compat_ioctl,
-#endif
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb87e4d89cd8..7ec59171f197 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -342,9 +342,11 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
+#ifdef CONFIG_SECCOMP_FILTER
seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
atomic_read(&p->seccomp.filter_count));
#endif
+#endif
seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d963ae7902f9..b9614db48b1d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -385,6 +385,7 @@ void pstore_record_init(struct pstore_record *record,
static void pstore_dump(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
+ struct kmsg_dump_iter iter;
unsigned long total = 0;
const char *why;
unsigned int part = 1;
@@ -405,6 +406,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
}
}
+ kmsg_dump_rewind(&iter);
+
oopscount++;
while (total < kmsg_bytes) {
char *dst;
@@ -435,7 +438,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
dst_size -= header_size;
/* Write dump contents. */
- if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+ if (!kmsg_dump_get_buffer(&iter, true, dst + header_size,
dst_size, &dump_size))
break;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index ca6d8a867285..fefe3d391d3a 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -56,7 +56,7 @@ MODULE_PARM_DESC(mem_size,
static unsigned int mem_type;
module_param(mem_type, uint, 0400);
MODULE_PARM_DESC(mem_type,
- "set to 1 to try to use unbuffered memory (default 0)");
+ "memory type: 0=write-combined (default), 1=unbuffered, 2=cached");
static int ramoops_max_reason = -1;
module_param_named(max_reason, ramoops_max_reason, int, 0400);
@@ -648,6 +648,10 @@ static int ramoops_parse_dt(struct platform_device *pdev,
pdata->mem_size = resource_size(res);
pdata->mem_address = res->start;
+ /*
+ * Setting "unbuffered" is deprecated and will be ignored if
+ * "mem_type" is also specified.
+ */
pdata->mem_type = of_property_read_bool(of_node, "unbuffered");
/*
* Setting "no-dump-oops" is deprecated and will be ignored if
@@ -666,6 +670,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
field = value; \
}
+ parse_u32("mem-type", pdata->record_size, pdata->mem_type);
parse_u32("record-size", pdata->record_size, 0);
parse_u32("console-size", pdata->console_size, 0);
parse_u32("ftrace-size", pdata->ftrace_size, 0);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fff363bfd484..fe5305028c6e 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -396,6 +396,10 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
persistent_ram_update_header_ecc(prz);
}
+#define MEM_TYPE_WCOMBINE 0
+#define MEM_TYPE_NONCACHED 1
+#define MEM_TYPE_NORMAL 2
+
static void *persistent_ram_vmap(phys_addr_t start, size_t size,
unsigned int memtype)
{
@@ -409,10 +413,20 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
page_start = start - offset_in_page(start);
page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
- if (memtype)
+ switch (memtype) {
+ case MEM_TYPE_NORMAL:
+ prot = PAGE_KERNEL;
+ break;
+ case MEM_TYPE_NONCACHED:
prot = pgprot_noncached(PAGE_KERNEL);
- else
+ break;
+ case MEM_TYPE_WCOMBINE:
prot = pgprot_writecombine(PAGE_KERNEL);
+ break;
+ default:
+ pr_err("invalid mem_type=%d\n", memtype);
+ return NULL;
+ }
pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
diff --git a/fs/readdir.c b/fs/readdir.c
index 19434b3c982c..09e8ed7d4161 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -150,6 +150,9 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
if (buf->result)
return -EINVAL;
+ buf->result = verify_dirent_name(name, namlen);
+ if (buf->result < 0)
+ return buf->result;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
@@ -405,6 +408,9 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name,
if (buf->result)
return -EINVAL;
+ buf->result = verify_dirent_name(name, namlen);
+ if (buf->result < 0)
+ return buf->result;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
buf->result = -EOVERFLOW;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1db0254bc38b..203a47232707 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -258,4 +258,6 @@ const struct inode_operations reiserfs_file_inode_operations = {
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
+ .fileattr_get = reiserfs_fileattr_get,
+ .fileattr_set = reiserfs_fileattr_set,
};
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 4f1cbd930179..4b86ecf5817e 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -10,6 +10,59 @@
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
+#include <linux/fileattr.h>
+
+int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+
+ if (!reiserfs_attrs(inode->i_sb))
+ return -ENOTTY;
+
+ fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
+
+ return 0;
+}
+
+int reiserfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ unsigned int flags = fa->flags;
+ int err;
+
+ reiserfs_write_lock(inode->i_sb);
+
+ err = -ENOTTY;
+ if (!reiserfs_attrs(inode->i_sb))
+ goto unlock;
+
+ err = -EOPNOTSUPP;
+ if (fileattr_has_fsx(fa))
+ goto unlock;
+
+ /*
+ * Is it quota file? Do not allow user to mess with it
+ */
+ err = -EPERM;
+ if (IS_NOQUOTA(inode))
+ goto unlock;
+
+ if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
+ err = reiserfs_unpack(inode);
+ if (err)
+ goto unlock;
+ }
+ sd_attrs_to_i_attrs(flags, inode);
+ REISERFS_I(inode)->i_attrs = flags;
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+ err = 0;
+unlock:
+ reiserfs_write_unlock(inode->i_sb);
+
+ return err;
+}
/*
* reiserfs_ioctl - handler for ioctl for inode
@@ -23,7 +76,6 @@
long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- unsigned int flags;
int err = 0;
reiserfs_write_lock(inode->i_sb);
@@ -32,7 +84,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case REISERFS_IOC_UNPACK:
if (S_ISREG(inode->i_mode)) {
if (arg)
- err = reiserfs_unpack(inode, filp);
+ err = reiserfs_unpack(inode);
} else
err = -ENOTTY;
break;
@@ -40,63 +92,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
* following two cases are taken from fs/ext2/ioctl.c by Remy
* Card (card@masi.ibp.fr)
*/
- case REISERFS_IOC_GETFLAGS:
- if (!reiserfs_attrs(inode->i_sb)) {
- err = -ENOTTY;
- break;
- }
-
- flags = REISERFS_I(inode)->i_attrs;
- err = put_user(flags, (int __user *)arg);
- break;
- case REISERFS_IOC_SETFLAGS:{
- if (!reiserfs_attrs(inode->i_sb)) {
- err = -ENOTTY;
- break;
- }
-
- err = mnt_want_write_file(filp);
- if (err)
- break;
-
- if (!inode_owner_or_capable(&init_user_ns, inode)) {
- err = -EPERM;
- goto setflags_out;
- }
- if (get_user(flags, (int __user *)arg)) {
- err = -EFAULT;
- goto setflags_out;
- }
- /*
- * Is it quota file? Do not allow user to mess with it
- */
- if (IS_NOQUOTA(inode)) {
- err = -EPERM;
- goto setflags_out;
- }
- err = vfs_ioc_setflags_prepare(inode,
- REISERFS_I(inode)->i_attrs,
- flags);
- if (err)
- goto setflags_out;
- if ((flags & REISERFS_NOTAIL_FL) &&
- S_ISREG(inode->i_mode)) {
- int result;
-
- result = reiserfs_unpack(inode, filp);
- if (result) {
- err = result;
- goto setflags_out;
- }
- }
- sd_attrs_to_i_attrs(flags, inode);
- REISERFS_I(inode)->i_attrs = flags;
- inode->i_ctime = current_time(inode);
- mark_inode_dirty(inode);
-setflags_out:
- mnt_drop_write_file(filp);
- break;
- }
case REISERFS_IOC_GETVERSION:
err = put_user(inode->i_generation, (int __user *)arg);
break;
@@ -138,12 +133,6 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
case REISERFS_IOC32_UNPACK:
cmd = REISERFS_IOC_UNPACK;
break;
- case REISERFS_IOC32_GETFLAGS:
- cmd = REISERFS_IOC_GETFLAGS;
- break;
- case REISERFS_IOC32_SETFLAGS:
- cmd = REISERFS_IOC_SETFLAGS;
- break;
case REISERFS_IOC32_GETVERSION:
cmd = REISERFS_IOC_GETVERSION;
break;
@@ -165,7 +154,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
* Function try to convert tail from direct item into indirect.
* It set up nopack attribute in the REISERFS_I(inode)->nopack
*/
-int reiserfs_unpack(struct inode *inode, struct file *filp)
+int reiserfs_unpack(struct inode *inode)
{
int retval = 0;
int index;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e6eb05e2b2f1..017db70d0f48 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1660,6 +1660,8 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
+ .fileattr_get = reiserfs_fileattr_get,
+ .fileattr_set = reiserfs_fileattr_set,
};
/*
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 0ca2ac62e534..3aa928ec527a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -18,8 +18,6 @@
/* the 32 bit compat definitions with int argument */
#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
-#define REISERFS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define REISERFS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
@@ -3408,7 +3406,10 @@ __u32 r5_hash(const signed char *msg, int len);
#define SPARE_SPACE 500
/* prototypes from ioctl.c */
+int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int reiserfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long reiserfs_compat_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg);
-int reiserfs_unpack(struct inode *inode, struct file *filp);
+int reiserfs_unpack(struct inode *inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1b9c7a387dc7..3ffafc73acf0 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2408,7 +2408,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
* IO to work
*/
if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
- err = reiserfs_unpack(inode, NULL);
+ err = reiserfs_unpack(inode);
if (err) {
reiserfs_warning(sb, "super-6520",
"Unpacking tail of quota file failed"
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index 9b3b06da568c..e47fde1182de 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -44,7 +44,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec);
static inline int reiserfs_xattrs_initialized(struct super_block *sb)
{
- return REISERFS_SB(sb)->priv_root != NULL;
+ return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
}
#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 456046e15873..040a1142915f 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
#endif
new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
+ case SIL_PERF_EVENT:
+ new.ssi_addr = (long) kinfo->si_addr;
+ new.ssi_perf = kinfo->si_perf;
+ break;
case SIL_CHLD:
new.ssi_pid = kinfo->si_pid;
new.ssi_uid = kinfo->si_uid;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index eb02072d28dd..723763746238 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -152,14 +152,18 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
start = le64_to_cpu(table[n]);
end = le64_to_cpu(table[n + 1]);
- if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= end
+ || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
}
start = le64_to_cpu(table[indexes - 1]);
- if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= lookup_table_start ||
+ (lookup_table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 11581bf31af4..ea5387679723 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -97,14 +97,16 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
start = le64_to_cpu(table[n]);
end = le64_to_cpu(table[n + 1]);
- if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= end || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
}
start = le64_to_cpu(table[indexes - 1]);
- if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= id_table_start || (id_table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8d64edb80ebf..b3fdc8212c5f 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -17,6 +17,7 @@
/* size of metadata (inode and directory) blocks */
#define SQUASHFS_METADATA_SIZE 8192
+#define SQUASHFS_BLOCK_OFFSET 2
/* default size of block device I/O */
#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index ead66670b41a..087cab8c78f4 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -109,14 +109,16 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
start = le64_to_cpu(table[n]);
end = le64_to_cpu(table[n + 1]);
- if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= end || (end - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
}
start = le64_to_cpu(table[indexes - 1]);
- if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) {
+ if (start >= table_start || (table_start - start) >
+ (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d9d8d7794eff..5bd8482e660a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1637,6 +1637,8 @@ const struct inode_operations ubifs_dir_inode_operations = {
.listxattr = ubifs_listxattr,
.update_time = ubifs_update_time,
.tmpfile = ubifs_tmpfile,
+ .fileattr_get = ubifs_fileattr_get,
+ .fileattr_set = ubifs_fileattr_set,
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0e4b4be3aa26..2e4e1d159969 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1648,6 +1648,8 @@ const struct inode_operations ubifs_file_inode_operations = {
.getattr = ubifs_getattr,
.listxattr = ubifs_listxattr,
.update_time = ubifs_update_time,
+ .fileattr_get = ubifs_fileattr_get,
+ .fileattr_set = ubifs_fileattr_set,
};
const struct inode_operations ubifs_symlink_inode_operations = {
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a4aaeea63893..dc3e26e9ed7b 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -102,7 +102,8 @@ static int switch_gc_head(struct ubifs_info *c)
* This function compares data nodes @a and @b. Returns %1 if @a has greater
* inode or block number, and %-1 otherwise.
*/
-static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
@@ -145,8 +146,8 @@ static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
* first and sorted by length in descending order. Directory entry nodes go
* after inode nodes and are sorted in ascending hash valuer order.
*/
-static int nondata_nodes_cmp(void *priv, struct list_head *a,
- struct list_head *b)
+static int nondata_nodes_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 2326d5122beb..c6a863487780 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/fileattr.h>
#include "ubifs.h"
/* Need to be kept consistent with checked flags in ioctl2ubifs() */
@@ -103,7 +104,7 @@ static int ubifs2ioctl(int ubifs_flags)
static int setflags(struct inode *inode, int flags)
{
- int oldflags, err, release;
+ int err, release;
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
@@ -114,11 +115,6 @@ static int setflags(struct inode *inode, int flags)
return err;
mutex_lock(&ui->ui_mutex);
- oldflags = ubifs2ioctl(ui->flags);
- err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
- if (err)
- goto out_unlock;
-
ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS);
ui->flags |= ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
@@ -132,54 +128,52 @@ static int setflags(struct inode *inode, int flags)
if (IS_SYNC(inode))
err = write_inode_now(inode, 1);
return err;
+}
-out_unlock:
- mutex_unlock(&ui->ui_mutex);
- ubifs_release_budget(c, &req);
- return err;
+int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ int flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
+ fileattr_fill_flags(fa, flags);
+
+ return 0;
}
-long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+int ubifs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- int flags, err;
- struct inode *inode = file_inode(file);
+ struct inode *inode = d_inode(dentry);
+ int flags = fa->flags;
- switch (cmd) {
- case FS_IOC_GETFLAGS:
- flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+ if (d_is_special(dentry))
+ return -ENOTTY;
- dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
- return put_user(flags, (int __user *) arg);
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- case FS_IOC_SETFLAGS: {
- if (IS_RDONLY(inode))
- return -EROFS;
+ if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS)
+ return -EOPNOTSUPP;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EACCES;
+ flags &= UBIFS_SETTABLE_IOCTL_FLAGS;
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~FS_DIRSYNC_FL;
- if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS)
- return -EOPNOTSUPP;
- flags &= UBIFS_SETTABLE_IOCTL_FLAGS;
+ dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
+ return setflags(inode, flags);
+}
- if (!S_ISDIR(inode->i_mode))
- flags &= ~FS_DIRSYNC_FL;
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+ struct inode *inode = file_inode(file);
- /*
- * Make sure the file-system is read-write and make sure it
- * will not become read-only while we are changing the flags.
- */
- err = mnt_want_write_file(file);
- if (err)
- return err;
- dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
- err = setflags(inode, flags);
- mnt_drop_write_file(file);
- return err;
- }
+ switch (cmd) {
case FS_IOC_SET_ENCRYPTION_POLICY: {
struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 0f8a6a16421b..4d17e5382b74 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -298,8 +298,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
* entries @a and @b by comparing their sequence numer. Returns %1 if @a has
* greater sequence number and %-1 otherwise.
*/
-static int replay_entries_cmp(void *priv, struct list_head *a,
- struct list_head *b)
+static int replay_entries_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct ubifs_info *c = priv;
struct replay_entry *ra, *rb;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 7fdfdbda4b8a..b65c599a386a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2053,6 +2053,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place);
void ubifs_destroy_size_tree(struct ubifs_info *c);
/* ioctl.c */
+int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ubifs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void ubifs_set_inode_flags(struct inode *inode);
#ifdef CONFIG_COMPAT
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index 7aee0ec63ade..eac6788fc6cf 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -225,7 +225,7 @@ static struct dentry *vboxsf_dir_lookup(struct inode *parent,
} else {
inode = vboxsf_new_inode(parent->i_sb);
if (!IS_ERR(inode))
- vboxsf_init_inode(sbi, inode, &fsinfo);
+ vboxsf_init_inode(sbi, inode, &fsinfo, false);
}
return d_splice_alias(inode, dentry);
@@ -245,7 +245,7 @@ static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry,
sf_i = VBOXSF_I(inode);
/* The host may have given us different attr then requested */
sf_i->force_restat = 1;
- vboxsf_init_inode(sbi, inode, info);
+ vboxsf_init_inode(sbi, inode, info, false);
d_instantiate(dentry, inode);
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index d7816c01a4f6..4f5e59f06284 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -207,7 +207,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
err = -ENOMEM;
goto fail_unmap;
}
- vboxsf_init_inode(sbi, iroot, &sbi->root_info);
+ vboxsf_init_inode(sbi, iroot, &sbi->root_info, false);
unlock_new_inode(iroot);
droot = d_make_root(iroot);
@@ -418,7 +418,7 @@ static int vboxsf_reconfigure(struct fs_context *fc)
/* Apply changed options to the root inode */
sbi->o = ctx->o;
- vboxsf_init_inode(sbi, iroot, &sbi->root_info);
+ vboxsf_init_inode(sbi, iroot, &sbi->root_info, true);
return 0;
}
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 3b847e3fba24..aec2ebf7d25a 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -45,12 +45,12 @@ struct inode *vboxsf_new_inode(struct super_block *sb)
}
/* set [inode] attributes based on [info], uid/gid based on [sbi] */
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
- const struct shfl_fsobjinfo *info)
+int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
+ const struct shfl_fsobjinfo *info, bool reinit)
{
const struct shfl_fsobjattr *attr;
s64 allocated;
- int mode;
+ umode_t mode;
attr = &info->attr;
@@ -75,29 +75,44 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
inode->i_mapping->a_ops = &vboxsf_reg_aops;
if (SHFL_IS_DIRECTORY(attr->mode)) {
- inode->i_mode = sbi->o.dmode_set ? sbi->o.dmode : mode;
- inode->i_mode &= ~sbi->o.dmask;
- inode->i_mode |= S_IFDIR;
- inode->i_op = &vboxsf_dir_iops;
- inode->i_fop = &vboxsf_dir_fops;
- /*
- * XXX: this probably should be set to the number of entries
- * in the directory plus two (. ..)
- */
- set_nlink(inode, 1);
+ if (sbi->o.dmode_set)
+ mode = sbi->o.dmode;
+ mode &= ~sbi->o.dmask;
+ mode |= S_IFDIR;
+ if (!reinit) {
+ inode->i_op = &vboxsf_dir_iops;
+ inode->i_fop = &vboxsf_dir_fops;
+ /*
+ * XXX: this probably should be set to the number of entries
+ * in the directory plus two (. ..)
+ */
+ set_nlink(inode, 1);
+ } else if (!S_ISDIR(inode->i_mode))
+ return -ESTALE;
+ inode->i_mode = mode;
} else if (SHFL_IS_SYMLINK(attr->mode)) {
- inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
- inode->i_mode &= ~sbi->o.fmask;
- inode->i_mode |= S_IFLNK;
- inode->i_op = &vboxsf_lnk_iops;
- set_nlink(inode, 1);
+ if (sbi->o.fmode_set)
+ mode = sbi->o.fmode;
+ mode &= ~sbi->o.fmask;
+ mode |= S_IFLNK;
+ if (!reinit) {
+ inode->i_op = &vboxsf_lnk_iops;
+ set_nlink(inode, 1);
+ } else if (!S_ISLNK(inode->i_mode))
+ return -ESTALE;
+ inode->i_mode = mode;
} else {
- inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
- inode->i_mode &= ~sbi->o.fmask;
- inode->i_mode |= S_IFREG;
- inode->i_op = &vboxsf_reg_iops;
- inode->i_fop = &vboxsf_reg_fops;
- set_nlink(inode, 1);
+ if (sbi->o.fmode_set)
+ mode = sbi->o.fmode;
+ mode &= ~sbi->o.fmask;
+ mode |= S_IFREG;
+ if (!reinit) {
+ inode->i_op = &vboxsf_reg_iops;
+ inode->i_fop = &vboxsf_reg_fops;
+ set_nlink(inode, 1);
+ } else if (!S_ISREG(inode->i_mode))
+ return -ESTALE;
+ inode->i_mode = mode;
}
inode->i_uid = sbi->o.uid;
@@ -116,6 +131,7 @@ void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
info->change_time.ns_relative_to_unix_epoch);
inode->i_mtime = ns_to_timespec64(
info->modification_time.ns_relative_to_unix_epoch);
+ return 0;
}
int vboxsf_create_at_dentry(struct dentry *dentry,
@@ -199,7 +215,9 @@ int vboxsf_inode_revalidate(struct dentry *dentry)
dentry->d_time = jiffies;
sf_i->force_restat = 0;
- vboxsf_init_inode(sbi, inode, &info);
+ err = vboxsf_init_inode(sbi, inode, &info, true);
+ if (err)
+ return err;
/*
* If the file was changed on the host side we need to invalidate the
diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h
index 760524e78c88..6a7a9cedebc6 100644
--- a/fs/vboxsf/vfsmod.h
+++ b/fs/vboxsf/vfsmod.h
@@ -82,8 +82,8 @@ extern const struct dentry_operations vboxsf_dentry_ops;
/* from utils.c */
struct inode *vboxsf_new_inode(struct super_block *sb);
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
- const struct shfl_fsobjinfo *info);
+int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
+ const struct shfl_fsobjinfo *info, bool reinit);
int vboxsf_create_at_dentry(struct dentry *dentry,
struct shfl_createparms *params);
int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index 88fb25119899..24d1b54de807 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -3,9 +3,13 @@
config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
select CRYPTO
- # SHA-256 is selected as it's intended to be the default hash algorithm.
+ # SHA-256 is implied as it's intended to be the default hash algorithm.
# To avoid bloat, other wanted algorithms must be selected explicitly.
- select CRYPTO_SHA256
+ # Note that CRYPTO_SHA256 denotes the generic C implementation, but
+ # some architectures provided optimized implementations of the same
+ # algorithm that may be used instead. In this case, CRYPTO_SHA256 may
+ # be omitted even if SHA-256 is being used.
+ imply CRYPTO_SHA256
help
This option enables fs-verity. fs-verity is the dm-verity
mechanism implemented at the file level. On supported
diff --git a/fs/xattr.c b/fs/xattr.c
index b3444e06cded..5c8c5175b385 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -186,12 +186,12 @@ EXPORT_SYMBOL(__vfs_setxattr);
* __vfs_setxattr_noperm - perform setxattr operation without performing
* permission checks.
*
- * @mnt_userns - user namespace of the mount the inode was found from
- * @dentry - object to perform setxattr on
- * @name - xattr name to set
- * @value - value to set @name to
- * @size - size of @value
- * @flags - flags to pass into filesystem operations
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: object to perform setxattr on
+ * @name: xattr name to set
+ * @value: value to set @name to
+ * @size: size of @value
+ * @flags: flags to pass into filesystem operations
*
* returns the result of the internal setxattr or setsecurity operations.
*
@@ -242,6 +242,7 @@ int __vfs_setxattr_noperm(struct user_namespace *mnt_userns,
* __vfs_setxattr_locked - set an extended attribute while holding the inode
* lock
*
+ * @mnt_userns: user namespace of the mount of the target inode
* @dentry: object to perform setxattr on
* @name: xattr name to set
* @value: value to set @name to
@@ -473,6 +474,7 @@ EXPORT_SYMBOL(__vfs_removexattr);
* __vfs_removexattr_locked - set an extended attribute while holding the inode
* lock
*
+ * @mnt_userns: user namespace of the mount of the target inode
* @dentry: object to perform setxattr on
* @name: name of xattr to remove
* @delegated_inode: on return, will contain an inode pointer that
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6fad140d4c8e..6bf7d8b7d743 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -770,8 +770,6 @@ struct xfs_scrub_metadata {
/*
* ioctl commands that are used by Linux filesystems
*/
-#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
-#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
/*
@@ -782,8 +780,6 @@ struct xfs_scrub_metadata {
#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
-#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index f88694f22d05..813b5f219113 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -63,8 +63,8 @@ xbitmap_init(
static int
xbitmap_range_cmp(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xbitmap_range *ap;
struct xbitmap_range *bp;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2344757ede63..e3a691937e92 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -265,8 +265,8 @@ xfs_trans_log_finish_bmap_update(
static int
xfs_bmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_bmap_intent *ba;
struct xfs_bmap_intent *bb;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 37a1d12762d8..592800c8852f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2124,9 +2124,9 @@ xfs_buf_delwri_queue(
*/
static int
xfs_buf_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ef17c1f6db32..a4075685d9eb 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -629,8 +629,8 @@ xfs_extent_busy_wait_all(
int
xfs_extent_busy_ag_cmp(
void *priv,
- struct list_head *l1,
- struct list_head *l2)
+ const struct list_head *l1,
+ const struct list_head *l2)
{
struct xfs_extent_busy *b1 =
container_of(l1, struct xfs_extent_busy, list);
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab3891971..8aea07100092 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,7 +58,8 @@ void
xfs_extent_busy_wait_all(struct xfs_mount *mp);
int
-xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
static inline void xfs_extent_busy_sort(struct list_head *list)
{
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 93223ebb3372..2424230ca2c3 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -397,8 +397,8 @@ xfs_trans_free_extent(
static int
xfs_extent_free_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_extent_free_item *ra;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f93370bd7b1e..2a8bdf33e6c4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -812,7 +812,7 @@ xfs_init_new_inode(
if (dir && !(dir->i_mode & S_ISGID) &&
(mp->m_flags & XFS_MOUNT_GRPID)) {
- inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode_fsuid_set(inode, mnt_userns);
inode->i_gid = dir->i_gid;
inode->i_mode = mode;
} else {
@@ -1007,8 +1007,8 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
- fsgid_into_mnt(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+ mapped_fsgid(mnt_userns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1158,8 +1158,8 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
- fsgid_into_mnt(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+ mapped_fsgid(mnt_userns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 99dfe89a8d08..bbda105a2ce5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/fileattr.h>
/*
* xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -1053,73 +1054,15 @@ xfs_ioc_ag_geometry(
* Linux extended inode flags interface.
*/
-STATIC unsigned int
-xfs_merge_ioc_xflags(
- unsigned int flags,
- unsigned int start)
-{
- unsigned int xflags = start;
-
- if (flags & FS_IMMUTABLE_FL)
- xflags |= FS_XFLAG_IMMUTABLE;
- else
- xflags &= ~FS_XFLAG_IMMUTABLE;
- if (flags & FS_APPEND_FL)
- xflags |= FS_XFLAG_APPEND;
- else
- xflags &= ~FS_XFLAG_APPEND;
- if (flags & FS_SYNC_FL)
- xflags |= FS_XFLAG_SYNC;
- else
- xflags &= ~FS_XFLAG_SYNC;
- if (flags & FS_NOATIME_FL)
- xflags |= FS_XFLAG_NOATIME;
- else
- xflags &= ~FS_XFLAG_NOATIME;
- if (flags & FS_NODUMP_FL)
- xflags |= FS_XFLAG_NODUMP;
- else
- xflags &= ~FS_XFLAG_NODUMP;
- if (flags & FS_DAX_FL)
- xflags |= FS_XFLAG_DAX;
- else
- xflags &= ~FS_XFLAG_DAX;
-
- return xflags;
-}
-
-STATIC unsigned int
-xfs_di2lxflags(
- uint16_t di_flags,
- uint64_t di_flags2)
-{
- unsigned int flags = 0;
-
- if (di_flags & XFS_DIFLAG_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
- if (di_flags & XFS_DIFLAG_APPEND)
- flags |= FS_APPEND_FL;
- if (di_flags & XFS_DIFLAG_SYNC)
- flags |= FS_SYNC_FL;
- if (di_flags & XFS_DIFLAG_NOATIME)
- flags |= FS_NOATIME_FL;
- if (di_flags & XFS_DIFLAG_NODUMP)
- flags |= FS_NODUMP_FL;
- if (di_flags2 & XFS_DIFLAG2_DAX) {
- flags |= FS_DAX_FL;
- }
- return flags;
-}
-
static void
xfs_fill_fsxattr(
struct xfs_inode *ip,
- bool attr,
- struct fsxattr *fa)
+ int whichfork,
+ struct fileattr *fa)
{
- struct xfs_ifork *ifp = attr ? ip->i_afp : &ip->i_df;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
+ fileattr_fill_xflags(fa, xfs_ip2xflags(ip));
fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
ip->i_mount->m_sb.sb_blocklog;
@@ -1131,19 +1074,33 @@ xfs_fill_fsxattr(
}
STATIC int
-xfs_ioc_fsgetxattr(
+xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
- int attr,
void __user *arg)
{
- struct fsxattr fa;
+ struct fileattr fa;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- xfs_fill_fsxattr(ip, attr, &fa);
+ xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ return copy_fsxattr_to_user(&fa, arg);
+}
+
+int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa)
+{
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
return 0;
}
@@ -1210,7 +1167,7 @@ static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
uint64_t di_flags2;
@@ -1253,7 +1210,7 @@ xfs_ioctl_setattr_xflags(
static void
xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -1280,10 +1237,9 @@ xfs_ioctl_setattr_prepare_dax(
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct file *file,
+ struct xfs_inode *ip,
struct xfs_dquot *pdqp)
{
- struct xfs_inode *ip = XFS_I(file_inode(file));
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error = -EROFS;
@@ -1299,24 +1255,11 @@ xfs_ioctl_setattr_get_trans(
if (error)
goto out_error;
- /*
- * CAP_FOWNER overrides the following restrictions:
- *
- * The user ID of the calling process must be equal to the file owner
- * ID, except in cases where the CAP_FSETID capability is applicable.
- */
- if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) {
- error = -EPERM;
- goto out_cancel;
- }
-
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
return tp;
-out_cancel:
- xfs_trans_cancel(tp);
out_error:
return ERR_PTR(error);
}
@@ -1340,12 +1283,15 @@ out_error:
static int
xfs_ioctl_setattr_check_extsize(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t size;
xfs_fsblock_t extsize_fsb;
+ if (!fa->fsx_valid)
+ return 0;
+
if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
@@ -1390,12 +1336,15 @@ xfs_ioctl_setattr_check_extsize(
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t size;
xfs_fsblock_t cowextsize_fsb;
+ if (!fa->fsx_valid)
+ return 0;
+
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
@@ -1422,8 +1371,11 @@ xfs_ioctl_setattr_check_cowextsize(
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
- struct fsxattr *fa)
+ struct fileattr *fa)
{
+ if (!fa->fsx_valid)
+ return 0;
+
/* Disallow 32bit project ids if projid32bit feature is not enabled. */
if (fa->fsx_projid > (uint16_t)-1 &&
!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
@@ -1431,14 +1383,13 @@ xfs_ioctl_setattr_check_projid(
return 0;
}
-STATIC int
-xfs_ioctl_setattr(
- struct file *file,
- struct fsxattr *fa)
+int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa)
{
- struct user_namespace *mnt_userns = file_mnt_user_ns(file);
- struct xfs_inode *ip = XFS_I(file_inode(file));
- struct fsxattr old_fa;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_dquot *pdqp = NULL;
@@ -1447,6 +1398,16 @@ xfs_ioctl_setattr(
trace_xfs_ioctl_setattr(ip);
+ if (d_is_special(dentry))
+ return -ENOTTY;
+
+ if (!fa->fsx_valid) {
+ if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
+ FS_NOATIME_FL | FS_NODUMP_FL |
+ FS_SYNC_FL | FS_DAX_FL | FS_PROJINHERIT_FL))
+ return -EOPNOTSUPP;
+ }
+
error = xfs_ioctl_setattr_check_projid(ip, fa);
if (error)
return error;
@@ -1459,7 +1420,7 @@ xfs_ioctl_setattr(
* If the IDs do change before we take the ilock, we're covered
* because the i_*dquot fields will get updated anyway.
*/
- if (XFS_IS_QUOTA_ON(mp)) {
+ if (fa->fsx_valid && XFS_IS_QUOTA_ON(mp)) {
error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
VFS_I(ip)->i_gid, fa->fsx_projid,
XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
@@ -1469,17 +1430,12 @@ xfs_ioctl_setattr(
xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(file, pdqp);
+ tp = xfs_ioctl_setattr_get_trans(ip, pdqp);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto error_free_dquots;
}
- xfs_fill_fsxattr(ip, false, &old_fa);
- error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- if (error)
- goto error_trans_cancel;
-
error = xfs_ioctl_setattr_check_extsize(ip, fa);
if (error)
goto error_trans_cancel;
@@ -1492,6 +1448,8 @@ xfs_ioctl_setattr(
if (error)
goto error_trans_cancel;
+ if (!fa->fsx_valid)
+ goto skip_xattr;
/*
* Change file ownership. Must be the owner or privileged. CAP_FSETID
* overrides the following restrictions:
@@ -1529,6 +1487,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_cowextsize = 0;
+skip_xattr:
error = xfs_trans_commit(tp);
/*
@@ -1546,91 +1505,6 @@ error_free_dquots:
return error;
}
-STATIC int
-xfs_ioc_fssetxattr(
- struct file *filp,
- void __user *arg)
-{
- struct fsxattr fa;
- int error;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
- error = xfs_ioctl_setattr(filp, &fa);
- mnt_drop_write_file(filp);
- return error;
-}
-
-STATIC int
-xfs_ioc_getxflags(
- xfs_inode_t *ip,
- void __user *arg)
-{
- unsigned int flags;
-
- flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2);
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
-STATIC int
-xfs_ioc_setxflags(
- struct xfs_inode *ip,
- struct file *filp,
- void __user *arg)
-{
- struct xfs_trans *tp;
- struct fsxattr fa;
- struct fsxattr old_fa;
- unsigned int flags;
- int error;
-
- if (copy_from_user(&flags, arg, sizeof(flags)))
- return -EFAULT;
-
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
- FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL | FS_DAX_FL))
- return -EOPNOTSUPP;
-
- fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- xfs_ioctl_setattr_prepare_dax(ip, &fa);
-
- tp = xfs_ioctl_setattr_get_trans(filp, NULL);
- if (IS_ERR(tp)) {
- error = PTR_ERR(tp);
- goto out_drop_write;
- }
-
- xfs_fill_fsxattr(ip, false, &old_fa);
- error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
- if (error) {
- xfs_trans_cancel(tp);
- goto out_drop_write;
- }
-
- error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
- if (error) {
- xfs_trans_cancel(tp);
- goto out_drop_write;
- }
-
- error = xfs_trans_commit(tp);
-out_drop_write:
- mnt_drop_write_file(filp);
- return error;
-}
-
static bool
xfs_getbmap_format(
struct kgetbmap *p,
@@ -2137,16 +2011,8 @@ xfs_file_ioctl(
case XFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
- case XFS_IOC_FSGETXATTR:
- return xfs_ioc_fsgetxattr(ip, 0, arg);
case XFS_IOC_FSGETXATTRA:
- return xfs_ioc_fsgetxattr(ip, 1, arg);
- case XFS_IOC_FSSETXATTR:
- return xfs_ioc_fssetxattr(filp, arg);
- case XFS_IOC_GETXFLAGS:
- return xfs_ioc_getxflags(ip, arg);
- case XFS_IOC_SETXFLAGS:
- return xfs_ioc_setxflags(ip, filp, arg);
+ return xfs_ioc_fsgetxattra(ip, arg);
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index bab6a5a92407..28453a6d4461 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -47,6 +47,17 @@ xfs_handle_to_dentry(
void __user *uhandle,
u32 hlen);
+extern int
+xfs_fileattr_get(
+ struct dentry *dentry,
+ struct fileattr *fa);
+
+extern int
+xfs_fileattr_set(
+ struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct fileattr *fa);
+
extern long
xfs_file_ioctl(
struct file *filp,
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 33c09ec8e6c0..e6506773ba55 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -484,8 +484,6 @@ xfs_file_compat_ioctl(
}
#endif
/* long changes size, but xfs only copiese out 32 bits */
- case XFS_IOC_GETXFLAGS_32:
- case XFS_IOC_SETXFLAGS_32:
case XFS_IOC_GETVERSION_32:
cmd = _NATIVE_IOC(cmd, long);
return xfs_file_ioctl(filp, cmd, p);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 053de7d894cd..9929482bf358 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -17,8 +17,6 @@
*/
/* stock kernel-level ioctls we support */
-#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
-#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 66ebccb5a6ff..e1f749b711f8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -21,6 +21,7 @@
#include "xfs_dir2.h"
#include "xfs_iomap.h"
#include "xfs_error.h"
+#include "xfs_ioctl.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -1152,6 +1153,8 @@ static const struct inode_operations xfs_inode_operations = {
.listxattr = xfs_vn_listxattr,
.fiemap = xfs_vn_fiemap,
.update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_dir_inode_operations = {
@@ -1177,6 +1180,8 @@ static const struct inode_operations xfs_dir_inode_operations = {
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1202,6 +1207,8 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
.tmpfile = xfs_vn_tmpfile,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
static const struct inode_operations xfs_symlink_inode_operations = {
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 07ebccbbf4df..746f4eda724c 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -269,8 +269,8 @@ xfs_trans_log_finish_refcount_update(
static int
xfs_refcount_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_refcount_intent *ra;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 49cebd68b672..dc4f0c9f0897 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -337,8 +337,8 @@ xfs_trans_log_finish_rmap_update(
static int
xfs_rmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_rmap_intent *ra;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 7f368b10ded1..63edb4dbed4a 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -182,8 +182,8 @@ xfs_symlink(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, fsuid_into_mnt(mnt_userns),
- fsgid_into_mnt(mnt_userns), prid,
+ error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(mnt_userns),
+ mapped_fsgid(mnt_userns), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)