summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c29
-rw-r--r--fs/9p/vfs_file.c6
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cache.c43
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/rxrpc.c47
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/aio.c27
-rw-r--r--fs/autofs4/autofs_i.h22
-rw-r--r--fs/autofs4/dev-ioctl.c44
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/binfmt_elf_fdpic.c2
-rw-r--r--fs/binfmt_flat.c10
-rw-r--r--fs/buffer.c31
-rw-r--r--fs/ceph/addr.c24
-rw-r--r--fs/ceph/cache.c43
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/char_dev.c58
-rw-r--r--fs/cifs/cache.c31
-rw-r--r--fs/cifs/cifsglob.h2
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c17
-rw-r--r--fs/cifs/connect.c25
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/cifs/smb2ops.c204
-rw-r--r--fs/cifs/smb2pdu.c41
-rw-r--r--fs/cifs/smb2pdu.h14
-rw-r--r--fs/cifs/smb2proto.h6
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/dax.c382
-rw-r--r--fs/dlm/debug_fs.c25
-rw-r--r--fs/dlm/lock.c8
-rw-r--r--fs/dlm/lockspace.c9
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/dlm/member.c15
-rw-r--r--fs/dlm/plock.c2
-rw-r--r--fs/dlm/user.c6
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/eventpoll.c72
-rw-r--r--fs/exec.c56
-rw-r--r--fs/exofs/file.c2
-rw-r--r--fs/ext2/file.c25
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h11
-rw-r--r--fs/ext4/file.c67
-rw-r--r--fs/ext4/hash.c4
-rw-r--r--fs/ext4/ialloc.c93
-rw-r--r--fs/ext4/inode.c23
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/super.c64
-rw-r--r--fs/ext4/xattr.c141
-rw-r--r--fs/f2fs/data.c5
-rw-r--r--fs/f2fs/file.c2
-rw-r--r--fs/fat/namei_vfat.c35
-rw-r--r--fs/fscache/page.c5
-rw-r--r--fs/fuse/file.c12
-rw-r--r--fs/gfs2/acl.c30
-rw-r--r--fs/gfs2/aops.c14
-rw-r--r--fs/gfs2/bmap.c24
-rw-r--r--fs/gfs2/dir.c4
-rw-r--r--fs/gfs2/file.c9
-rw-r--r--fs/gfs2/glock.c137
-rw-r--r--fs/gfs2/glock.h36
-rw-r--r--fs/gfs2/glops.c30
-rw-r--r--fs/gfs2/incore.h4
-rw-r--r--fs/gfs2/inode.c17
-rw-r--r--fs/gfs2/lock_dlm.c5
-rw-r--r--fs/gfs2/log.c13
-rw-r--r--fs/gfs2/lops.c7
-rw-r--r--fs/gfs2/meta_io.c9
-rw-r--r--fs/gfs2/ops_fstype.c7
-rw-r--r--fs/gfs2/quota.c7
-rw-r--r--fs/gfs2/rgrp.c3
-rw-r--r--fs/gfs2/super.c71
-rw-r--r--fs/gfs2/util.h1
-rw-r--r--fs/gfs2/xattr.c9
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/file.c2
-rw-r--r--fs/hugetlbfs/inode.c41
-rw-r--r--fs/inode.c3
-rw-r--r--fs/internal.h1
-rw-r--r--fs/iomap.c4
-rw-r--r--fs/isofs/inode.c11
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/super.c12
-rw-r--r--fs/kernfs/file.c2
-rw-r--r--fs/kernfs/mount.c4
-rw-r--r--fs/locks.c130
-rw-r--r--fs/namei.c15
-rw-r--r--fs/ncpfs/file.c2
-rw-r--r--fs/nfs/fscache-index.c40
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nilfs2/page.c3
-rw-r--r--fs/notify/dnotify/dnotify.c2
-rw-r--r--fs/ntfs/dir.c2
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ocfs2/acl.c2
-rw-r--r--fs/ocfs2/acl.h7
-rw-r--r--fs/ocfs2/alloc.c22
-rw-r--r--fs/ocfs2/alloc.h3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c42
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/file.c9
-rw-r--r--fs/ocfs2/journal.c1
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/quota_global.c36
-rw-r--r--fs/ocfs2/quota_local.c17
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/overlayfs/readdir.c4
-rw-r--r--fs/proc/base.c5
-rw-r--r--fs/proc/devices.c8
-rw-r--r--fs/proc/generic.c34
-rw-r--r--fs/proc/internal.h5
-rw-r--r--fs/proc/meminfo.c10
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c270
-rw-r--r--fs/proc/task_nommu.c5
-rw-r--r--fs/pstore/inode.c24
-rw-r--r--fs/quota/dquot.c504
-rw-r--r--fs/quota/quota_tree.c10
-rw-r--r--fs/quota/quota_v1.c11
-rw-r--r--fs/quota/quota_v2.c93
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/read_write.c50
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/select.c6
-rw-r--r--fs/stat.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/sync.c9
-rw-r--r--fs/ubifs/file.c7
-rw-r--r--fs/udf/inode.c13
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/super.c12
-rw-r--r--fs/userfaultfd.c66
-rw-r--r--fs/xfs/libxfs/xfs_attr.c156
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c6
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c39
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c280
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c1
-rw-r--r--fs/xfs/libxfs/xfs_btree.c27
-rw-r--r--fs/xfs/libxfs/xfs_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_defer.c29
-rw-r--r--fs/xfs/libxfs/xfs_defer.h5
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c57
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c21
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c2
-rw-r--r--fs/xfs/xfs_aops.c71
-rw-r--r--fs/xfs/xfs_attr_inactive.c6
-rw-r--r--fs/xfs/xfs_bmap_item.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c122
-rw-r--r--fs/xfs/xfs_bmap_util.h1
-rw-r--r--fs/xfs/xfs_buf_item.c137
-rw-r--r--fs/xfs/xfs_buf_item.h5
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_error.h4
-rw-r--r--fs/xfs/xfs_file.c99
-rw-r--r--fs/xfs/xfs_icache.c10
-rw-r--r--fs/xfs/xfs_inode.c40
-rw-r--r--fs/xfs/xfs_inode_item.c47
-rw-r--r--fs/xfs/xfs_ioctl.c41
-rw-r--r--fs/xfs/xfs_iomap.c10
-rw-r--r--fs/xfs/xfs_iops.c2
-rw-r--r--fs/xfs/xfs_log.c33
-rw-r--r--fs/xfs/xfs_log_recover.c161
-rw-r--r--fs/xfs/xfs_qm.c44
-rw-r--r--fs/xfs/xfs_refcount_item.c2
-rw-r--r--fs/xfs/xfs_reflink.c11
-rw-r--r--fs/xfs/xfs_rtalloc.c2
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--fs/xfs/xfs_symlink.c5
-rw-r--r--fs/xfs/xfs_trace.h48
-rw-r--r--fs/xfs/xfs_trans.c28
-rw-r--r--fs/xfs/xfs_trans.h17
-rw-r--r--fs/xfs/xfs_trans_ail.c20
-rw-r--r--fs/xfs/xfs_trans_buf.c79
-rw-r--r--fs/xfs/xfs_trans_inode.c14
-rw-r--r--fs/xfs/xfs_trans_priv.h31
191 files changed, 3220 insertions, 2370 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 103ca5e1267b..64c58eb26159 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OKAY;
}
-static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct v9fs_inode *v9inode = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- for (;;) {
- nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.name = "9p.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.get_attr = v9fs_cache_inode_get_attr,
.get_aux = v9fs_cache_inode_get_aux,
.check_aux = v9fs_cache_inode_check_aux,
- .now_uncached = v9fs_cache_inode_now_uncached,
};
void v9fs_cache_inode_get_cookie(struct inode *inode)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3de3b4a89d89..03c9e325bfbc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -288,7 +288,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = OFFSET_MAX;
else
fl->fl_end = glock.start + glock.length - 1;
- fl->fl_pid = glock.proc_id;
+ fl->fl_pid = -glock.proc_id;
}
kfree(glock.client_id);
return res;
@@ -445,7 +445,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
struct p9_wstat wstat;
int retval;
- retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ retval = file_write_and_wait_range(filp, start, end);
if (retval)
return retval;
@@ -468,7 +468,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
struct inode *inode = filp->f_mapping->host;
int retval;
- retval = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ retval = file_write_and_wait_range(filp, start, end);
if (retval)
return retval;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 196ee7f6fdc4..00331810f690 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -954,7 +954,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct inode *inode = filp->f_mapping->host;
int ret, err;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(filp, start, end);
if (err)
return err;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 577763c3d88b..1fe855191261 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
uint16_t buflen);
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
struct fscache_netfs afs_cache_netfs = {
.name = "afs",
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
.get_attr = afs_vnode_cache_get_attr,
.get_aux = afs_vnode_cache_get_aux,
.check_aux = afs_vnode_cache_check_aux,
- .now_uncached = afs_vnode_cache_now_uncached,
};
/*
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
_leave(" = SUCCESS");
return FSCACHE_CHECKAUX_OKAY;
}
-
-/*
- * indication the cookie is no longer uncached
- * - this function is called when the backing store currently caching a cookie
- * is removed
- * - the netfs should use this to clean up any markers indicating cached pages
- * - this is mandatory for any object that may have data
- */
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
-{
- struct afs_vnode *vnode = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- _enter("{%x,%x,%Lx}",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- for (;;) {
- /* grab a bunch of pages to clean */
- nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-
- _leave("");
-}
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 100b207efc9e..c05f1f1c0d41 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,7 +12,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
-#include <rxrpc/packet.h>
#include "internal.h"
#include "afs_fs.h"
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 02781e78ffb6..0bf191f0dbaf 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -14,7 +14,6 @@
#include <net/sock.h>
#include <net/af_rxrpc.h>
-#include <rxrpc/packet.h>
#include "internal.h"
#include "afs_cm.h"
@@ -293,6 +292,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
}
/*
+ * Advance the AFS call state when the RxRPC call ends the transmit phase.
+ */
+static void afs_notify_end_request_tx(struct sock *sock,
+ struct rxrpc_call *rxcall,
+ unsigned long call_user_ID)
+{
+ struct afs_call *call = (struct afs_call *)call_user_ID;
+
+ if (call->state == AFS_CALL_REQUESTING)
+ call->state = AFS_CALL_AWAIT_REPLY;
+}
+
+/*
* attach the data from a bunch of pages on an inode to a call
*/
static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
@@ -311,14 +323,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
bytes = msg->msg_iter.count;
nr = msg->msg_iter.nr_segs;
- /* Have to change the state *before* sending the last
- * packet as RxRPC might give us the reply before it
- * returns from sending the request.
- */
- if (first + nr - 1 >= last)
- call->state = AFS_CALL_AWAIT_REPLY;
- ret = rxrpc_kernel_send_data(afs_socket, call->rxcall,
- msg, bytes);
+ ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg,
+ bytes, afs_notify_end_request_tx);
for (loop = 0; loop < nr; loop++)
put_page(bv[loop].bv_page);
if (ret < 0)
@@ -410,7 +416,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
if (!call->send_pages)
call->state = AFS_CALL_AWAIT_REPLY;
ret = rxrpc_kernel_send_data(afs_socket, rxcall,
- &msg, call->request_size);
+ &msg, call->request_size,
+ afs_notify_end_request_tx);
if (ret < 0)
goto error_do_abort;
@@ -742,6 +749,20 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
}
/*
+ * Advance the AFS call state when an RxRPC service call ends the transmit
+ * phase.
+ */
+static void afs_notify_end_reply_tx(struct sock *sock,
+ struct rxrpc_call *rxcall,
+ unsigned long call_user_ID)
+{
+ struct afs_call *call = (struct afs_call *)call_user_ID;
+
+ if (call->state == AFS_CALL_REPLYING)
+ call->state = AFS_CALL_AWAIT_ACK;
+}
+
+/*
* send an empty reply
*/
void afs_send_empty_reply(struct afs_call *call)
@@ -760,7 +781,8 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_flags = 0;
call->state = AFS_CALL_AWAIT_ACK;
- switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) {
+ switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0,
+ afs_notify_end_reply_tx)) {
case 0:
_leave(" [replied]");
return;
@@ -798,7 +820,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
msg.msg_flags = 0;
call->state = AFS_CALL_AWAIT_ACK;
- n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len);
+ n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len,
+ afs_notify_end_reply_tx);
if (n >= 0) {
/* Success */
_leave(" [replied]");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2d2fccd5044b..106e43db1115 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -714,7 +714,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
vnode->fid.vid, vnode->fid.vnode, file,
datasync);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
inode_lock(inode);
diff --git a/fs/aio.c b/fs/aio.c
index dcad3a66748c..b5d69f28d8b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
pgoff_t idx;
int rc;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the ctx->completion_lock. That does not work with the
+ * migration workflow of MIGRATE_SYNC_NO_COPY.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
rc = 0;
/* mapping->private_lock here protects against the kioctx teardown. */
@@ -441,10 +449,9 @@ static const struct address_space_operations aio_ctx_aops = {
#endif
};
-static int aio_setup_ring(struct kioctx *ctx)
+static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
{
struct aio_ring *ring;
- unsigned nr_events = ctx->max_reqs;
struct mm_struct *mm = current->mm;
unsigned long size, unused;
int nr_pages;
@@ -707,6 +714,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
int err = -ENOMEM;
/*
+ * Store the original nr_events -- what userspace passed to io_setup(),
+ * for counting against the global limit -- before it changes.
+ */
+ unsigned int max_reqs = nr_events;
+
+ /*
* We keep track of the number of available ringbuffer slots, to prevent
* overflow (reqs_available), and we also use percpu counters for this.
*
@@ -724,14 +737,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-EINVAL);
}
- if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
+ if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
if (!ctx)
return ERR_PTR(-ENOMEM);
- ctx->max_reqs = nr_events;
+ ctx->max_reqs = max_reqs;
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
@@ -753,7 +766,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
if (!ctx->cpu)
goto err;
- err = aio_setup_ring(ctx);
+ err = aio_setup_ring(ctx, nr_events);
if (err < 0)
goto err;
@@ -764,8 +777,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
/* limit the number of system wide aios */
spin_lock(&aio_nr_lock);
- if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
- aio_nr + nr_events < aio_nr) {
+ if (aio_nr + ctx->max_reqs > aio_max_nr ||
+ aio_nr + ctx->max_reqs < aio_nr) {
spin_unlock(&aio_nr_lock);
err = -EAGAIN;
goto err_ctx;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index beef981aa54f..4737615f0eaa 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -11,10 +11,21 @@
#include <linux/auto_fs4.h>
#include <linux/auto_dev-ioctl.h>
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/string.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/completion.h>
+#include <asm/current.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -24,17 +35,6 @@
#define AUTOFS_DEV_IOCTL_IOC_COUNT \
(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <asm/current.h>
-#include <linux/uaccess.h>
-
#ifdef pr_fmt
#undef pr_fmt
#endif
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index dd9f1bebb5a3..b7c816f39404 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -93,17 +93,17 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* at the end of the struct.
*/
static struct autofs_dev_ioctl *
- copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
- if (copy_from_user(&tmp, in, sizeof(tmp)))
+ if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE))
return ERR_PTR(-EFAULT);
- if (tmp.size < sizeof(tmp))
+ if (tmp.size < AUTOFS_DEV_IOCTL_SIZE)
return ERR_PTR(-EINVAL);
- if (tmp.size > (PATH_MAX + sizeof(tmp)))
+ if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX)
return ERR_PTR(-ENAMETOOLONG);
res = memdup_user(in, tmp.size);
@@ -133,8 +133,8 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
goto out;
}
- if (param->size > sizeof(*param)) {
- err = invalid_str(param->path, param->size - sizeof(*param));
+ if (param->size > AUTOFS_DEV_IOCTL_SIZE) {
+ err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE);
if (err) {
pr_warn(
"path string terminator missing for cmd(0x%08x)\n",
@@ -258,11 +258,6 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
if (err)
goto out;
- /*
- * Find autofs super block that has the device number
- * corresponding to the autofs fs we want to open.
- */
-
filp = dentry_open(&path, O_RDONLY, current_cred());
path_put(&path);
if (IS_ERR(filp)) {
@@ -451,7 +446,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
dev_t devid;
int err = -ENOENT;
- if (param->size <= sizeof(*param)) {
+ if (param->size <= AUTOFS_DEV_IOCTL_SIZE) {
err = -EINVAL;
goto out;
}
@@ -539,7 +534,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
unsigned int devid, magic;
int err = -ENOENT;
- if (param->size <= sizeof(*param)) {
+ if (param->size <= AUTOFS_DEV_IOCTL_SIZE) {
err = -EINVAL;
goto out;
}
@@ -628,10 +623,6 @@ static int _autofs_dev_ioctl(unsigned int command,
ioctl_fn fn = NULL;
int err = 0;
- /* only root can play with this */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
cmd = _IOC_NR(command);
@@ -640,6 +631,14 @@ static int _autofs_dev_ioctl(unsigned int command,
return -ENOTTY;
}
+ /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD
+ * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD
+ */
+ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD &&
+ cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* Copy the parameters into kernel space. */
param = copy_dev_ioctl(user);
if (IS_ERR(param))
@@ -706,7 +705,8 @@ out:
return err;
}
-static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
+static long autofs_dev_ioctl(struct file *file, unsigned int command,
+ unsigned long u)
{
int err;
@@ -715,9 +715,10 @@ static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
}
#ifdef CONFIG_COMPAT
-static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
+static long autofs_dev_ioctl_compat(struct file *file, unsigned int command,
+ unsigned long u)
{
- return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u));
+ return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u));
}
#else
#define autofs_dev_ioctl_compat NULL
@@ -733,7 +734,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
.name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .fops = &_dev_ioctl_fops,
+ .mode = 0644,
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6466153f2bf0..ec45d24875b1 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -252,7 +252,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
- NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+ NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index cf93a4fad012..5aa9199dfb13 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -650,7 +650,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid));
NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid));
- NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+ NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
#ifdef ARCH_DLINFO
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a1e6860b6f46..ce6537c50ec1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -192,13 +192,11 @@ static int decompress_exec(
memset(&strm, 0, sizeof(strm));
strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
- if (strm.workspace == NULL) {
- pr_debug("no memory for decompress workspace\n");
+ if (!strm.workspace)
return -ENOMEM;
- }
+
buf = kmalloc(LBUFSIZE, GFP_KERNEL);
- if (buf == NULL) {
- pr_debug("no memory for read buffer\n");
+ if (!buf) {
retval = -ENOMEM;
goto out_free;
}
@@ -890,7 +888,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
* as we're past the point of no return and are dealing with shared
* libraries.
*/
- bprm.cred_prepared = 1;
+ bprm.called_set_creds = 1;
res = prepare_binprm(&bprm);
diff --git a/fs/buffer.c b/fs/buffer.c
index 50e51a67dc78..170df856bdb9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct pagevec pvec;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
- int i;
+ int i, count;
struct buffer_head *bh;
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
+ while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
+ count = pagevec_count(&pvec);
+ for (i = 0; i < count; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
if (!page_has_buffers(page))
continue;
/*
@@ -1670,7 +1667,9 @@ unlock_page:
}
pagevec_release(&pvec);
cond_resched();
- index++;
+ /* End of range already reached? */
+ if (index > end || !index)
+ break;
}
}
EXPORT_SYMBOL(clean_bdev_aliases);
@@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
pagevec_init(&pvec, 0);
do {
- unsigned want, nr_pages, i;
+ unsigned nr_pages, i;
- want = min_t(unsigned, end - index, PAGEVEC_SIZE);
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
if (nr_pages == 0)
break;
@@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
lastoff < page_offset(page))
goto check_range;
- /* Searching done if the page index is out of range. */
- if (page->index >= end)
- goto not_found;
-
lock_page(page);
if (likely(page->mapping == inode->i_mapping) &&
page_has_buffers(page)) {
@@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
unlock_page(page);
lastoff = page_offset(page) + PAGE_SIZE;
}
-
- /* Searching done if fewer pages returned than wanted. */
- if (nr_pages < want)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index < end);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 50836280a6f8..1bc709fe330a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -189,7 +189,7 @@ static int ceph_releasepage(struct page *page, gfp_t g)
/*
* read a single page, without unlocking it.
*/
-static int readpage_nounlock(struct file *filp, struct page *page)
+static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -219,7 +219,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
- goto out;
+ return -EINPROGRESS;
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
@@ -249,8 +249,11 @@ out:
static int ceph_readpage(struct file *filp, struct page *page)
{
- int r = readpage_nounlock(filp, page);
- unlock_page(page);
+ int r = ceph_do_readpage(filp, page);
+ if (r != -EINPROGRESS)
+ unlock_page(page);
+ else
+ r = 0;
return r;
}
@@ -1237,7 +1240,7 @@ retry_locked:
goto retry_locked;
r = writepage_nounlock(page, NULL);
if (r < 0)
- goto fail_nosnap;
+ goto fail_unlock;
goto retry_locked;
}
@@ -1265,11 +1268,14 @@ retry_locked:
}
/* we need to read it. */
- r = readpage_nounlock(file, page);
- if (r < 0)
- goto fail_nosnap;
+ r = ceph_do_readpage(file, page);
+ if (r < 0) {
+ if (r == -EINPROGRESS)
+ return -EAGAIN;
+ goto fail_unlock;
+ }
goto retry_locked;
-fail_nosnap:
+fail_unlock:
unlock_page(page);
return r;
}
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index fd1172823f86..174d6e6569a8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
return FSCACHE_CHECKAUX_OKAY;
}
-static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
-{
- struct ceph_inode_info* ci = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- dout("ceph inode 0x%p now uncached", ci);
-
- while (1) {
- nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
-
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.get_attr = ceph_fscache_inode_get_attr,
.get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
- .now_uncached = ceph_fscache_inode_now_uncached,
};
void ceph_fscache_register_inode_cookie(struct inode *inode)
@@ -297,13 +266,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp)
}
}
-static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
-{
- if (!error)
- SetPageUptodate(page);
-}
-
-static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
+static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error)
{
if (!error)
SetPageUptodate(page);
@@ -331,7 +294,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
return -ENOBUFS;
ret = fscache_read_or_alloc_page(ci->fscache, page,
- ceph_vfs_readpage_complete, NULL,
+ ceph_readpage_from_fscache_complete, NULL,
GFP_KERNEL);
switch (ret) {
@@ -360,7 +323,7 @@ int ceph_readpages_from_fscache(struct inode *inode,
return -ENOBUFS;
ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
- ceph_vfs_readpage_complete_unlock,
+ ceph_readpage_from_fscache_complete,
NULL, mapping_gfp_mask(mapping));
switch (ret) {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 64ae74472046..8cd63e8123d8 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -79,7 +79,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
err = ceph_mdsc_do_request(mdsc, inode, req);
if (operation == CEPH_MDS_OP_GETFILELOCK) {
- fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index fb8507f521b2..ebcc8fb3fa66 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -28,6 +28,8 @@ static struct kobj_map *cdev_map;
static DEFINE_MUTEX(chrdevs_lock);
+#define CHRDEV_MAJOR_HASH_SIZE 255
+
static struct char_device_struct {
struct char_device_struct *next;
unsigned int major;
@@ -49,16 +51,39 @@ void chrdev_show(struct seq_file *f, off_t offset)
{
struct char_device_struct *cd;
- if (offset < CHRDEV_MAJOR_HASH_SIZE) {
- mutex_lock(&chrdevs_lock);
- for (cd = chrdevs[offset]; cd; cd = cd->next)
+ mutex_lock(&chrdevs_lock);
+ for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) {
+ if (cd->major == offset)
seq_printf(f, "%3d %s\n", cd->major, cd->name);
- mutex_unlock(&chrdevs_lock);
}
+ mutex_unlock(&chrdevs_lock);
}
#endif /* CONFIG_PROC_FS */
+static int find_dynamic_major(void)
+{
+ int i;
+ struct char_device_struct *cd;
+
+ for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) {
+ if (chrdevs[i] == NULL)
+ return i;
+ }
+
+ for (i = CHRDEV_MAJOR_DYN_EXT_START;
+ i > CHRDEV_MAJOR_DYN_EXT_END; i--) {
+ for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next)
+ if (cd->major == i)
+ break;
+
+ if (cd == NULL || cd->major != i)
+ return i;
+ }
+
+ return -EBUSY;
+}
+
/*
* Register a single major with a specified minor range.
*
@@ -84,22 +109,21 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
mutex_lock(&chrdevs_lock);
- /* temporary */
if (major == 0) {
- for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) {
- if (chrdevs[i] == NULL)
- break;
- }
-
- if (i < CHRDEV_MAJOR_DYN_END)
- pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n",
- name, i);
-
- if (i == 0) {
- ret = -EBUSY;
+ ret = find_dynamic_major();
+ if (ret < 0) {
+ pr_err("CHRDEV \"%s\" dynamic allocation region is full\n",
+ name);
goto out;
}
- major = i;
+ major = ret;
+ }
+
+ if (major >= CHRDEV_MAJOR_MAX) {
+ pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n",
+ name, major, CHRDEV_MAJOR_MAX);
+ ret = -EINVAL;
+ goto out;
}
cd->major = major;
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf4a27c..2c14020e5e1d 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OKAY;
}
-static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct cifsInodeInfo *cifsi = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
-
- for (;;) {
- nr_pages = pagevec_lookup(&pvec,
- cifsi->vfs_inode.i_mapping, first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.name = "CIFS.uniqueid",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.get_attr = cifs_fscache_inode_get_attr,
.get_aux = cifs_fscache_inode_get_aux,
.check_aux = cifs_fscache_inode_check_aux,
- .now_uncached = cifs_fscache_inode_now_uncached,
};
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 221693fe49ec..808486c29f0d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -421,7 +421,7 @@ struct smb_version_operations {
size_t, struct cifs_sb_info *);
int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
const char *, const void *, const __u16,
- const struct nls_table *, int);
+ const struct nls_table *, struct cifs_sb_info *);
struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6eb3147132e3..4143c9dec463 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -484,7 +484,8 @@ extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const char *ea_name,
const void *ea_value, const __u16 ea_value_len,
- const struct nls_table *nls_codepage, int remap_special_chars);
+ const struct nls_table *nls_codepage,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 72a53bd19865..35dc5bf01ee2 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -178,6 +178,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
* reconnect the same SMB session
*/
mutex_lock(&ses->session_mutex);
+
+ /*
+ * Recheck after acquire mutex. If another thread is negotiating
+ * and the server never sends an answer the socket will be closed
+ * and tcpStatus set to reconnect.
+ */
+ if (server->tcpStatus == CifsNeedReconnect) {
+ rc = -EHOSTDOWN;
+ mutex_unlock(&ses->session_mutex);
+ goto out;
+ }
+
rc = cifs_negotiate_protocol(0, ses);
if (rc == 0 && ses->need_reconnect)
rc = cifs_setup_session(0, ses, nls_codepage);
@@ -2522,7 +2534,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
pLockData->fl_start = le64_to_cpu(parm_data->start);
pLockData->fl_end = pLockData->fl_start +
le64_to_cpu(parm_data->length) - 1;
- pLockData->fl_pid = le32_to_cpu(parm_data->pid);
+ pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
}
}
@@ -6264,7 +6276,7 @@ int
CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const char *ea_name, const void *ea_value,
const __u16 ea_value_len, const struct nls_table *nls_codepage,
- int remap)
+ struct cifs_sb_info *cifs_sb)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -6273,6 +6285,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int bytes_returned = 0;
__u16 params, param_offset, byte_count, offset, count;
+ int remap = cifs_remap(cifs_sb);
cifs_dbg(FYI, "In SetEA\n");
SetEARetry:
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 59647eb72c5f..5aa2d278ca84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -509,7 +509,8 @@ server_unresponsive(struct TCP_Server_Info *server)
* 65s kernel_recvmsg times out, and we see that we haven't gotten
* a response in >60s.
*/
- if (server->tcpStatus == CifsGood &&
+ if ((server->tcpStatus == CifsGood ||
+ server->tcpStatus == CifsNeedNegotiate) &&
time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
server->hostname, (2 * server->echo_interval) / HZ);
@@ -1223,6 +1224,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
char *tmp_end, *value;
char delim;
bool got_ip = false;
+ bool got_version = false;
unsigned short port = 0;
struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
@@ -1874,24 +1876,35 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
pr_warn("CIFS: server netbiosname longer than 15 truncated.\n");
break;
case Opt_ver:
+ /* version of mount userspace tools, not dialect */
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
+ /* If interface changes in mount.cifs bump to new ver */
if (strncasecmp(string, "1", 1) == 0) {
+ if (strlen(string) > 1) {
+ pr_warn("Bad mount helper ver=%s. Did "
+ "you want SMB1 (CIFS) dialect "
+ "and mean to type vers=1.0 "
+ "instead?\n", string);
+ goto cifs_parse_mount_err;
+ }
/* This is the default */
break;
}
/* For all other value, error */
- pr_warn("CIFS: Invalid version specified\n");
+ pr_warn("CIFS: Invalid mount helper version specified\n");
goto cifs_parse_mount_err;
case Opt_vers:
+ /* protocol version (dialect) */
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
if (cifs_parse_smb_version(string, vol) != 0)
goto cifs_parse_mount_err;
+ got_version = true;
break;
case Opt_sec:
string = match_strdup(args);
@@ -1973,6 +1986,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
else if (override_gid == 1)
pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n");
+ if (got_version == false)
+ pr_warn("No dialect specified on mount. Default has changed to "
+ "a more secure dialect, SMB3 (vers=3.0), from CIFS "
+ "(SMB1). To use the less secure SMB1 dialect to access "
+ "old servers which do not support SMB3 specify vers=1.0"
+ " on mount. For somewhat newer servers such as Windows "
+ "7 try vers=2.1.\n");
+
kfree(mountdata_copy);
return 0;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 569d3fb736be..e702d48bd023 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -205,7 +205,7 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon)
int i;
if (unlikely(direntry->d_name.len >
- tcon->fsAttrInfo.MaxPathNameComponentLength))
+ le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength)))
return -ENAMETOOLONG;
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bc09df6b473a..0786f19d288f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2329,7 +2329,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
struct inode *inode = file_inode(file);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ rc = file_write_and_wait_range(file, start, end);
if (rc)
return rc;
inode_lock(inode);
@@ -2371,7 +2371,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
struct inode *inode = file->f_mapping->host;
- rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ rc = file_write_and_wait_range(file, start, end);
if (rc)
return rc;
inode_lock(inode);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cfacf2c97e94..fb2934b9b97c 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -426,6 +426,194 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+static ssize_t
+move_smb2_ea_to_cifs(char *dst, size_t dst_size,
+ struct smb2_file_full_ea_info *src, size_t src_size,
+ const unsigned char *ea_name)
+{
+ int rc = 0;
+ unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
+ char *name, *value;
+ size_t name_len, value_len, user_name_len;
+
+ while (src_size > 0) {
+ name = &src->ea_data[0];
+ name_len = (size_t)src->ea_name_length;
+ value = &src->ea_data[src->ea_name_length + 1];
+ value_len = (size_t)le16_to_cpu(src->ea_value_length);
+
+ if (name_len == 0) {
+ break;
+ }
+
+ if (src_size < 8 + name_len + 1 + value_len) {
+ cifs_dbg(FYI, "EA entry goes beyond length of list\n");
+ rc = -EIO;
+ goto out;
+ }
+
+ if (ea_name) {
+ if (ea_name_len == name_len &&
+ memcmp(ea_name, name, name_len) == 0) {
+ rc = value_len;
+ if (dst_size == 0)
+ goto out;
+ if (dst_size < value_len) {
+ rc = -ERANGE;
+ goto out;
+ }
+ memcpy(dst, value, value_len);
+ goto out;
+ }
+ } else {
+ /* 'user.' plus a terminating null */
+ user_name_len = 5 + 1 + name_len;
+
+ rc += user_name_len;
+
+ if (dst_size >= user_name_len) {
+ dst_size -= user_name_len;
+ memcpy(dst, "user.", 5);
+ dst += 5;
+ memcpy(dst, src->ea_data, name_len);
+ dst += name_len;
+ *dst = 0;
+ ++dst;
+ } else if (dst_size == 0) {
+ /* skip copy - calc size only */
+ } else {
+ /* stop before overrun buffer */
+ rc = -ERANGE;
+ break;
+ }
+ }
+
+ if (!src->next_entry_offset)
+ break;
+
+ if (src_size < le32_to_cpu(src->next_entry_offset)) {
+ /* stop before overrun buffer */
+ rc = -ERANGE;
+ break;
+ }
+ src_size -= le32_to_cpu(src->next_entry_offset);
+ src = (void *)((char *)src +
+ le32_to_cpu(src->next_entry_offset));
+ }
+
+ /* didn't find the named attribute */
+ if (ea_name)
+ rc = -ENODATA;
+
+out:
+ return (ssize_t)rc;
+}
+
+static ssize_t
+smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *path, const unsigned char *ea_name,
+ char *ea_data, size_t buf_size,
+ struct cifs_sb_info *cifs_sb)
+{
+ int rc;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+ struct smb2_file_full_ea_info *smb2_data;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_EA;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (rc) {
+ cifs_dbg(FYI, "open failed rc=%d\n", rc);
+ return rc;
+ }
+
+ smb2_data = kzalloc(SMB2_MAX_EA_BUF, GFP_KERNEL);
+ if (smb2_data == NULL) {
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return -ENOMEM;
+ }
+
+ rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ smb2_data);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+ if (!rc)
+ rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data,
+ SMB2_MAX_EA_BUF, ea_name);
+
+ kfree(smb2_data);
+ return rc;
+}
+
+
+static int
+smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *path, const char *ea_name, const void *ea_value,
+ const __u16 ea_value_len, const struct nls_table *nls_codepage,
+ struct cifs_sb_info *cifs_sb)
+{
+ int rc;
+ __le16 *utf16_path;
+ __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+ struct smb2_file_full_ea_info *ea;
+ int ea_name_len = strlen(ea_name);
+ int len;
+
+ if (ea_name_len > 255)
+ return -EINVAL;
+
+ utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
+ if (!utf16_path)
+ return -ENOMEM;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_WRITE_EA;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
+ kfree(utf16_path);
+ if (rc) {
+ cifs_dbg(FYI, "open failed rc=%d\n", rc);
+ return rc;
+ }
+
+ len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+ ea = kzalloc(len, GFP_KERNEL);
+ if (ea == NULL) {
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return -ENOMEM;
+ }
+
+ ea->ea_name_length = ea_name_len;
+ ea->ea_value_length = cpu_to_le16(ea_value_len);
+ memcpy(ea->ea_data, ea_name, ea_name_len + 1);
+ memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len);
+
+ rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea,
+ len);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+ return rc;
+}
+
static bool
smb2_can_echo(struct TCP_Server_Info *server)
{
@@ -2572,6 +2760,10 @@ struct smb_version_operations smb20_operations = {
.dir_needs_close = smb2_dir_needs_close,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = smb2_query_eas,
+ .set_EA = smb2_set_ea,
+#endif /* CIFS_XATTR */
#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
@@ -2662,6 +2854,10 @@ struct smb_version_operations smb21_operations = {
.enum_snapshots = smb3_enum_snapshots,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = smb2_query_eas,
+ .set_EA = smb2_set_ea,
+#endif /* CIFS_XATTR */
#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
@@ -2762,6 +2958,10 @@ struct smb_version_operations smb30_operations = {
.receive_transform = smb3_receive_transform,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = smb2_query_eas,
+ .set_EA = smb2_set_ea,
+#endif /* CIFS_XATTR */
#ifdef CONFIG_CIFS_ACL
.get_acl = get_smb2_acl,
.get_acl_by_fid = get_smb2_acl_by_fid,
@@ -2863,6 +3063,10 @@ struct smb_version_operations smb311_operations = {
.receive_transform = smb3_receive_transform,
.get_dfs_refer = smb2_get_dfs_refer,
.select_sectype = smb2_select_sectype,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = smb2_query_eas,
+ .set_EA = smb2_set_ea,
+#endif /* CIFS_XATTR */
};
#endif /* CIFS_SMB311 */
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 97edb4d376cd..5531e7ee1210 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -238,6 +238,18 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
* the same SMB session
*/
mutex_lock(&tcon->ses->session_mutex);
+
+ /*
+ * Recheck after acquire mutex. If another thread is negotiating
+ * and the server never sends an answer the socket will be closed
+ * and tcpStatus set to reconnect.
+ */
+ if (server->tcpStatus == CifsNeedReconnect) {
+ rc = -EHOSTDOWN;
+ mutex_unlock(&tcon->ses->session_mutex);
+ goto out;
+ }
+
rc = cifs_negotiate_protocol(0, tcon->ses);
if (!rc && tcon->ses->need_reconnect)
rc = cifs_setup_session(0, tcon->ses, nls_codepage);
@@ -514,7 +526,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
* No tcon so can't do
* cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
*/
- if (rc != 0)
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(VFS, "Dialect not supported by server. Consider "
+ "specifying vers=1.0 or vers=2.1 on mount for accessing"
+ " older servers\n");
+ goto neg_exit;
+ } else if (rc != 0)
goto neg_exit;
cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
@@ -2140,6 +2157,18 @@ qinf_exit:
return rc;
}
+int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct smb2_file_full_ea_info *data)
+{
+ return query_info(xid, tcon, persistent_fid, volatile_fid,
+ FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0,
+ SMB2_MAX_EA_BUF,
+ sizeof(struct smb2_file_full_ea_info),
+ (void **)&data,
+ NULL);
+}
+
int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data)
{
@@ -3180,6 +3209,16 @@ SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
}
int
+SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct smb2_file_full_ea_info *buf, int len)
+{
+ return send_set_info(xid, tcon, persistent_fid, volatile_fid,
+ current->tgid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE,
+ 0, 1, (void **)&buf, &len);
+}
+
+int
SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
const u64 persistent_fid, const u64 volatile_fid,
__u8 oplock_level)
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 18700fd25a0b..393ed5f4e1b6 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -84,8 +84,8 @@
#define NUMBER_OF_SMB2_COMMANDS 0x0013
-/* BB FIXME - analyze following length BB */
-#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
+/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */
+#define MAX_SMB2_HDR_SIZE 0x00b0
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
@@ -1178,6 +1178,16 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
char FileName[0]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
+#define SMB2_MAX_EA_BUF 2048
+
+struct smb2_file_full_ea_info { /* encoding of response for level 15 */
+ __le32 next_entry_offset;
+ __u8 flags;
+ __u8 ea_name_length;
+ __le16 ea_value_length;
+ char ea_data[0]; /* \0 terminated name plus value */
+} __packed; /* level 15 Set */
+
/*
* This level 18, although with struct with same name is different from cifs
* level 0x107. Level 0x107 has an extra u64 between AccessFlags and
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 1cadaf9f3c58..003217099ef3 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -132,6 +132,9 @@ extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id);
+extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id,
+ struct smb2_file_full_ea_info *data);
extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct smb2_file_all_info *data);
@@ -169,6 +172,9 @@ extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
struct cifs_ntsd *pnntsd, int pacllen, int aclflag);
+extern int SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct smb2_file_full_ea_info *buf, int len);
extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid);
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index de50e749ff05..52f975d848a0 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -84,7 +84,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
if (pTcon->ses->server->ops->set_EA)
rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
full_path, name, value, (__u16)size,
- cifs_sb->local_nls, cifs_remap(cifs_sb));
+ cifs_sb->local_nls, cifs_sb);
break;
case XATTR_CIFS_ACL: {
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 2dd4a7af7dd7..d27b326d96f4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1331,8 +1331,6 @@ COMPATIBLE_IOCTL(DMX_SET_FILTER)
COMPATIBLE_IOCTL(DMX_SET_PES_FILTER)
COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE)
COMPATIBLE_IOCTL(DMX_GET_PES_PIDS)
-COMPATIBLE_IOCTL(DMX_GET_CAPS)
-COMPATIBLE_IOCTL(DMX_SET_SOURCE)
COMPATIBLE_IOCTL(DMX_GET_STC)
COMPATIBLE_IOCTL(FE_GET_INFO)
COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD)
diff --git a/fs/dax.c b/fs/dax.c
index 865d42c63e23..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking. In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+
+static unsigned long dax_radix_sector(void *entry)
+{
+ return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+ return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+ ((unsigned long)sector << RADIX_DAX_SHIFT) |
+ RADIX_DAX_ENTRY_LOCK);
+}
+
+static unsigned int dax_radix_order(void *entry)
+{
+ if ((unsigned long)entry & RADIX_DAX_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ return 0;
+}
+
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_HZP;
+ return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
* the range covered by the PMD map to the same bit lock.
*/
if (dax_is_pmd_entry(entry))
- index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+ index &= ~PG_PMD_COLOUR;
key->mapping = mapping;
key->entry_start = index;
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
}
/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, void *entry, bool wake_all)
+{
+ struct exceptional_entry_key key;
+ wait_queue_head_t *wq;
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
+/*
* Check whether the given slot is locked. The function must be called with
* mapping->tree_lock held
*/
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
for (;;) {
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
+ if (!entry ||
+ WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
}
static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+ pgoff_t index)
{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
+ dax_unlock_mapping_entry(mapping, index);
}
/*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
static void put_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- if (!radix_tree_exceptional_entry(entry))
+ if (!entry)
return;
/* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
}
/*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
*
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+ entry = ERR_PTR(-EIO);
+ goto out_unlock;
+ }
+
if (entry) {
if (size_flag & RADIX_DAX_PMD) {
- if (!radix_tree_exceptional_entry(entry) ||
- dax_is_pte_entry(entry)) {
+ if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
entry = ERR_PTR(-EEXIST);
goto out_unlock;
}
} else { /* trying to grab a PTE entry */
- if (radix_tree_exceptional_entry(entry) &&
- dax_is_pmd_entry(entry) &&
+ if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
if (err) {
if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
- /* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(entry)) {
- struct page *page = entry;
-
- get_page(page);
- spin_unlock_irq(&mapping->tree_lock);
- lock_page(page);
- /* Page got truncated? Retry... */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto restart;
- }
- return page;
- }
entry = lock_slot(mapping, slot);
out_unlock:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
-{
- struct exceptional_entry_key key;
- wait_queue_head_t *wq;
-
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
- /*
- * Checking for locked entry and prepare_to_wait_exclusive() happens
- * under mapping->tree_lock, ditto for entry handling in our callers.
- * So at this point all tasks that could have seen our entry locked
- * must be in the waitqueue and the following check will see them.
- */
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
+ if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_mapping_entry(mapping, index, false);
}
-/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
-{
- struct inode *inode = mapping->host;
- struct page *page;
- int ret;
-
- /* Hole page already exists? Return it... */
- if (!radix_tree_exceptional_entry(*entry)) {
- page = *entry;
- goto finish_fault;
- }
-
- /* This will replace locked radix tree entry with a hole page */
- page = find_or_create_page(mapping, vmf->pgoff,
- vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
-
-finish_fault:
- vmf->page = page;
- ret = finish_fault(vmf);
- vmf->page = NULL;
- *entry = page;
- if (!ret) {
- /* Grab reference for PTE that is now referencing the page */
- get_page(page);
- ret = VM_FAULT_NOPAGE;
- }
-out:
- trace_dax_load_hole(inode, vmf, ret);
- return ret;
-}
-
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, struct page *to,
unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int error = 0;
- bool hole_fill = false;
void *new_entry;
pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- /* Replacing hole page with block mapping? */
- if (!radix_tree_exceptional_entry(entry)) {
- hole_fill = true;
- /*
- * Unmap the page now before we remove it from page cache below.
- * The page is locked so it cannot be faulted in again.
- */
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_SIZE, 0);
- error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
- if (error)
- return ERR_PTR(error);
- } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
- /* replacing huge zero page with PMD block mapping */
- unmap_mapping_range(mapping,
- (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+ if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ /* we are replacing a zero page with block mapping */
+ if (dax_is_pmd_entry(entry))
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+ PMD_SIZE, 0);
+ else /* pte entry */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
new_entry = dax_radix_locked_entry(sector, flags);
- if (hole_fill) {
- __delete_from_page_cache(entry, NULL);
- /* Drop pagecache reference */
- put_page(entry);
- error = __radix_tree_insert(page_tree, index,
- dax_radix_order(new_entry), new_entry);
- if (error) {
- new_entry = ERR_PTR(error);
- goto unlock;
- }
- mapping->nrexceptional++;
- } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the radix tree if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
new_entry, NULL, NULL);
+ entry = new_entry;
}
+
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
spin_unlock_irq(&mapping->tree_lock);
- if (hole_fill) {
- radix_tree_preload_end();
- /*
- * We don't need hole page anymore, it has been replaced with
- * locked radix tree entry now.
- */
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(entry);
- unlock_page(entry);
- put_page(entry);
- }
- return new_entry;
+ return entry;
}
static inline unsigned long
@@ -646,11 +594,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
- bool changed;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- unsigned long address;
+ unsigned long address, start, end;
cond_resched();
@@ -658,8 +605,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
continue;
address = pgoff_address(index, vma);
- changed = false;
- if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
+
+ /*
+ * Note because we provide start/end to follow_pte_pmd it will
+ * call mmu_notifier_invalidate_range_start() on our behalf
+ * before taking any lock.
+ */
+ if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
continue;
if (pmdp) {
@@ -676,7 +628,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- changed = true;
+ mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pmd:
spin_unlock(ptl);
#endif
@@ -691,13 +643,12 @@ unlock_pmd:
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
- changed = true;
+ mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
- if (changed)
- mmu_notifier_invalidate_page(vma->vm_mm, address);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
}
i_mmap_unlock_read(mapping);
}
@@ -724,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
- if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
@@ -796,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
@@ -871,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void **entryp,
+ sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- void *entry = *entryp;
void *ret, *kaddr;
pgoff_t pgoff;
int id, rc;
@@ -896,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
- *entryp = ret;
trace_dax_insert_mapping(mapping->host, vmf, ret);
- return vm_insert_mixed(vma, vaddr, pfn);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ return vm_insert_mixed(vma, vaddr, pfn);
}
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
+/*
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
*/
-int dax_pfn_mkwrite(struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+ struct vm_fault *vmf)
{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ unsigned long vaddr = vmf->address;
+ int ret = VM_FAULT_NOPAGE;
+ struct page *zero_page;
+ void *entry2;
- spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry)) {
- if (entry)
- put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
- trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+ zero_page = ZERO_PAGE(0);
+ if (unlikely(!zero_page)) {
+ ret = VM_FAULT_OOM;
+ goto out;
}
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- /*
- * If we race with somebody updating the PTE and finish_mkwrite_fault()
- * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
- * the fault in either case.
- */
- finish_mkwrite_fault(vmf);
- put_locked_mapping_entry(mapping, index, entry);
- trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+
+ entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_ZERO_PAGE);
+ if (IS_ERR(entry2)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+out:
+ trace_dax_load_hole(inode, vmf, ret);
+ return ret;
}
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
static bool dax_range_is_aligned(struct block_device *bdev,
unsigned int offset, unsigned int length)
@@ -1056,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (map_len > end - pos)
map_len = end - pos;
+ /*
+ * The userspace address for the memory copy has already been
+ * validated via access_ok() in either vfs_read() or
+ * vfs_write(), depending on which operation we are doing.
+ */
if (iov_iter_rw(iter) == WRITE)
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
@@ -1220,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+ sector, PAGE_SIZE, entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
@@ -1228,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
- vmf_ret = dax_load_hole(mapping, &entry, vmf);
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1255,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ put_locked_mapping_entry(mapping, vmf->pgoff);
out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
#ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void **entryp)
+ loff_t pos, void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1280,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
void *ret = NULL, *kaddr;
long length = 0;
pgoff_t pgoff;
- pfn_t pfn;
+ pfn_t pfn = {};
int id;
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1300,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
goto unlock_fallback;
dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1318,7 +1267,7 @@ fallback:
}
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void **entryp)
+ void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1333,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
if (unlikely(!zero_page))
goto fallback;
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
- RADIX_DAX_PMD | RADIX_DAX_HZP);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1413,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
+ * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+ * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
+ * is already in the tree, for instance), it will return -EEXIST and
+ * we just fall back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
@@ -1449,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+ result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, &entry);
+ result = dax_pmd_load_hole(vmf, &iomap, entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1478,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
+ put_locked_mapping_entry(mapping, pgoff);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index ca7089aeadab..fa08448e35dd 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -68,7 +68,7 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- seq_puts(s, "\n");
+ seq_putc(s, '\n');
}
static void print_format1(struct dlm_rsb *res, struct seq_file *s)
@@ -111,7 +111,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s)
}
if (rsb_flag(res, RSB_VALNOTVALID))
seq_puts(s, " (INVALID)");
- seq_puts(s, "\n");
+ seq_putc(s, '\n');
if (seq_has_overflowed(s))
goto out;
}
@@ -156,7 +156,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s)
lkb->lkb_id, print_lockmode(lkb->lkb_rqmode));
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- seq_puts(s, "\n");
+ seq_putc(s, '\n');
if (seq_has_overflowed(s))
goto out;
}
@@ -287,7 +287,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- seq_puts(s, "\n");
+ seq_putc(s, '\n');
if (seq_has_overflowed(s))
goto out;
@@ -298,7 +298,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s)
for (i = 0; i < lvblen; i++)
seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
- seq_puts(s, "\n");
+ seq_putc(s, '\n');
if (seq_has_overflowed(s))
goto out;
@@ -361,8 +361,7 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- seq_puts(s, "\n");
-
+ seq_putc(s, '\n');
unlock_rsb(r);
}
@@ -436,7 +435,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
if (bucket >= ls->ls_rsbtbl_size)
return NULL;
- ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
+ ri = kzalloc(sizeof(*ri), GFP_NOFS);
if (!ri)
return NULL;
if (n == 0)
@@ -742,7 +741,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
int dlm_create_debug_file(struct dlm_ls *ls)
{
- char name[DLM_LOCKSPACE_LEN+8];
+ char name[DLM_LOCKSPACE_LEN + 8];
/* format 1 */
@@ -757,7 +756,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
/* format 2 */
memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
+ snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name);
ls->ls_debug_locks_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -770,7 +769,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
/* format 3 */
memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+ snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name);
ls->ls_debug_all_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -783,7 +782,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
/* format 4 */
memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name);
+ snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name);
ls->ls_debug_toss_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
@@ -794,7 +793,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
goto fail;
memset(name, 0, sizeof(name));
- snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+ snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name);
ls->ls_debug_waiters_dentry = debugfs_create_file(name,
S_IFREG | S_IRUGO,
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6df332296c66..d4aaddec1b16 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1426,7 +1426,7 @@ void dlm_scan_waiters(struct dlm_ls *ls)
if (!num_nodes) {
num_nodes = ls->ls_num_nodes;
- warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL);
+ warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
}
if (!warned)
continue;
@@ -5119,11 +5119,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
int wait_type, stub_unlock_result, stub_cancel_result;
int dir_nodeid;
- ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL);
- if (!ms_stub) {
- log_error(ls, "dlm_recover_waiters_pre no mem");
+ ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL);
+ if (!ms_stub)
return;
- }
mutex_lock(&ls->ls_waiters_mutex);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 91592b75c309..78a7c855b06b 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -235,7 +235,7 @@ static int dlm_uevent(struct kset *kset, struct kobject *kobj,
return 0;
}
-static struct kset_uevent_ops dlm_uevent_ops = {
+static const struct kset_uevent_ops dlm_uevent_ops = {
.uevent = dlm_uevent,
};
@@ -453,9 +453,14 @@ static int new_lockspace(const char *name, const char *cluster,
*ops_result = 0;
}
+ if (!cluster)
+ log_print("dlm cluster name '%s' is being used without an application provided cluster name",
+ dlm_config.ci_cluster_name);
+
if (dlm_config.ci_recover_callbacks && cluster &&
strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
- log_print("dlm cluster name %s mismatch %s",
+ log_print("dlm cluster name '%s' does not match "
+ "the application cluster name '%s'",
dlm_config.ci_cluster_name, cluster);
error = -EBADR;
goto out;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9382db998ec9..4813d0e0cd9b 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -729,7 +729,7 @@ static int tcp_accept_from_sock(struct connection *con)
mutex_unlock(&connections_lock);
memset(&peeraddr, 0, sizeof(peeraddr));
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_lite(dlm_local_addr[0]->ss_family,
SOCK_STREAM, IPPROTO_TCP, &newsock);
if (result < 0)
return -ENOMEM;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 9c47f1c14a8b..3fda3832cf6a 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -217,8 +217,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
}
array_size = max + need;
-
- array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
+ array = kcalloc(array_size, sizeof(*array), GFP_NOFS);
if (!array)
return -ENOMEM;
@@ -319,7 +318,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
struct dlm_member *memb;
int error;
- memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
+ memb = kzalloc(sizeof(*memb), GFP_NOFS);
if (!memb)
return -ENOMEM;
@@ -405,8 +404,7 @@ static void make_member_array(struct dlm_ls *ls)
}
ls->ls_total_weight = total;
-
- array = kmalloc(sizeof(int) * total, GFP_NOFS);
+ array = kmalloc_array(total, sizeof(*array), GFP_NOFS);
if (!array)
return;
@@ -492,8 +490,7 @@ void dlm_lsop_recover_done(struct dlm_ls *ls)
return;
num = ls->ls_num_nodes;
-
- slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
+ slots = kcalloc(num, sizeof(*slots), GFP_KERNEL);
if (!slots)
return;
@@ -673,11 +670,11 @@ int dlm_ls_stop(struct dlm_ls *ls)
int dlm_ls_start(struct dlm_ls *ls)
{
- struct dlm_recover *rv = NULL, *rv_old;
+ struct dlm_recover *rv, *rv_old;
struct dlm_config_node *nodes;
int error, count;
- rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
+ rv = kzalloc(sizeof(*rv), GFP_NOFS);
if (!rv)
return -ENOMEM;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d401425f602a..e631b1689228 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -367,7 +367,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
locks_init_lock(fl);
fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
fl->fl_flags = FL_POSIX;
- fl->fl_pid = op->info.pid;
+ fl->fl_pid = -op->info.pid;
fl->fl_start = op->info.start;
fl->fl_end = op->info.end;
rv = 0;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 23488f559cf9..d18e7a539f11 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -123,6 +123,8 @@ static void compat_input(struct dlm_write_request *kb,
static void compat_output(struct dlm_lock_result *res,
struct dlm_lock_result32 *res32)
{
+ memset(res32, 0, sizeof(*res32));
+
res32->version[0] = res->version[0];
res32->version[1] = res->version[1];
res32->version[2] = res->version[2];
@@ -355,6 +357,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
error = misc_register(&ls->ls_device);
if (error) {
kfree(ls->ls_device.name);
+ /* this has to be set to NULL
+ * to avoid a double-free in dlm_device_deregister
+ */
+ ls->ls_device.name = NULL;
}
fail:
return error;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index ca4e83750214..c74ed3ca3372 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -328,7 +328,7 @@ ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int rc;
- rc = filemap_write_and_wait(file->f_mapping);
+ rc = file_write_and_wait(file);
if (rc)
return rc;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e767e4389cb1..2fabd19cdeea 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -205,7 +205,7 @@ struct eventpoll {
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
- struct rb_root rbr;
+ struct rb_root_cached rbr;
/*
* This is a single linked list that chains all the "struct epitem" that
@@ -600,8 +600,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq)
wait_queue_head_t *whead;
rcu_read_lock();
- /* If it is cleared by POLLFREE, it should be rcu-safe */
- whead = rcu_dereference(pwq->whead);
+ /*
+ * If it is cleared by POLLFREE, it should be rcu-safe.
+ * If we read NULL we need a barrier paired with
+ * smp_store_release() in ep_poll_callback(), otherwise
+ * we rely on whead->lock.
+ */
+ whead = smp_load_acquire(&pwq->whead);
if (whead)
remove_wait_queue(whead, &pwq->wait);
rcu_read_unlock();
@@ -791,7 +796,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
- rb_erase(&epi->rbn, &ep->rbr);
+ rb_erase_cached(&epi->rbn, &ep->rbr);
spin_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
@@ -835,7 +840,7 @@ static void ep_free(struct eventpoll *ep)
/*
* Walks through the whole tree by unregistering poll callbacks.
*/
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_unregister_pollwait(ep, epi);
@@ -851,7 +856,7 @@ static void ep_free(struct eventpoll *ep)
* a lockdep warning.
*/
mutex_lock(&ep->mtx);
- while ((rbp = rb_first(&ep->rbr)) != NULL) {
+ while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
epi = rb_entry(rbp, struct epitem, rbn);
ep_remove(ep, epi);
cond_resched();
@@ -958,7 +963,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f)
struct rb_node *rbp;
mutex_lock(&ep->mtx);
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
struct inode *inode = file_inode(epi->ffd.file);
@@ -1035,7 +1040,7 @@ static int ep_alloc(struct eventpoll **pep)
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
- ep->rbr = RB_ROOT;
+ ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;
@@ -1061,7 +1066,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
struct epoll_filefd ffd;
ep_set_ffd(&ffd, file, fd);
- for (rbp = ep->rbr.rb_node; rbp; ) {
+ for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
if (kcmp > 0)
@@ -1083,7 +1088,7 @@ static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long t
struct rb_node *rbp;
struct epitem *epi;
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (epi->ffd.fd == tfd) {
if (toff == 0)
@@ -1134,17 +1139,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
struct eventpoll *ep = epi->ep;
int ewake = 0;
- if ((unsigned long)key & POLLFREE) {
- ep_pwq_from_wait(wait)->whead = NULL;
- /*
- * whead = NULL above can race with ep_remove_wait_queue()
- * which can do another remove_wait_queue() after us, so we
- * can't use __remove_wait_queue(). whead->lock is held by
- * the caller.
- */
- list_del_init(&wait->entry);
- }
-
spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1228,10 +1222,26 @@ out_unlock:
if (pwake)
ep_poll_safewake(&ep->poll_wait);
- if (epi->event.events & EPOLLEXCLUSIVE)
- return ewake;
+ if (!(epi->event.events & EPOLLEXCLUSIVE))
+ ewake = 1;
- return 1;
+ if ((unsigned long)key & POLLFREE) {
+ /*
+ * If we race with ep_remove_wait_queue() it can miss
+ * ->whead = NULL and do another remove_wait_queue() after
+ * us, so we can't use __remove_wait_queue().
+ */
+ list_del_init(&wait->entry);
+ /*
+ * ->whead != NULL protects us from the race with ep_free()
+ * or ep_remove(), ep_remove_wait_queue() takes whead->lock
+ * held by the caller. Once we nullify it, nothing protects
+ * ep/epi or even wait.
+ */
+ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
+ }
+
+ return ewake;
}
/*
@@ -1263,20 +1273,22 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
int kcmp;
- struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
+ struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
struct epitem *epic;
+ bool leftmost = true;
while (*p) {
parent = *p;
epic = rb_entry(parent, struct epitem, rbn);
kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
- if (kcmp > 0)
+ if (kcmp > 0) {
p = &parent->rb_right;
- else
+ leftmost = false;
+ } else
p = &parent->rb_left;
}
rb_link_node(&epi->rbn, parent, p);
- rb_insert_color(&epi->rbn, &ep->rbr);
+ rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}
@@ -1520,7 +1532,7 @@ error_remove_epi:
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
- rb_erase(&epi->rbn, &ep->rbr);
+ rb_erase_cached(&epi->rbn, &ep->rbr);
error_unregister:
ep_unregister_pollwait(ep, epi);
@@ -1868,7 +1880,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
mutex_lock_nested(&ep->mtx, call_nests + 1);
ep->visited = 1;
list_add(&ep->visited_list_link, &visited_list);
- for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
diff --git a/fs/exec.c b/fs/exec.c
index 62175cbcc801..01a9fb9d8ac3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1259,6 +1259,12 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
perf_event_comm(tsk, exec);
}
+/*
+ * Calling this is the point of no return. None of the failures will be
+ * seen by userspace since either the process is already taking a fatal
+ * signal (via de_thread() or coredump), or will have SEGV raised
+ * (after exec_mmap()) by search_binary_handlers (see below).
+ */
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
@@ -1286,7 +1292,13 @@ int flush_old_exec(struct linux_binprm * bprm)
if (retval)
goto out;
- bprm->mm = NULL; /* We're using it now */
+ /*
+ * After clearing bprm->mm (to mark that current is using the
+ * prepared mm now), we have nothing left of the original
+ * process. If anything from here on returns an error, the check
+ * in search_binary_handler() will SEGV current.
+ */
+ bprm->mm = NULL;
set_fs(USER_DS);
current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
@@ -1331,15 +1343,38 @@ EXPORT_SYMBOL(would_dump);
void setup_new_exec(struct linux_binprm * bprm)
{
+ /*
+ * Once here, prepare_binrpm() will not be called any more, so
+ * the final state of setuid/setgid/fscaps can be merged into the
+ * secureexec flag.
+ */
+ bprm->secureexec |= bprm->cap_elevated;
+
+ if (bprm->secureexec) {
+ /* Make sure parent cannot signal privileged process. */
+ current->pdeath_signal = 0;
+
+ /*
+ * For secureexec, reset the stack limit to sane default to
+ * avoid bad behavior from the prior rlimits. This has to
+ * happen before arch_pick_mmap_layout(), which examines
+ * RLIMIT_STACK, but after the point of no return to avoid
+ * needing to clean up the change on failure.
+ */
+ if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM)
+ current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM;
+ }
+
arch_pick_mmap_layout(current->mm);
- /* This is the point of no return */
current->sas_ss_sp = current->sas_ss_size = 0;
- if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
- set_dumpable(current->mm, SUID_DUMP_USER);
- else
+ /* Figure out dumpability. */
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
+ bprm->secureexec)
set_dumpable(current->mm, suid_dumpable);
+ else
+ set_dumpable(current->mm, SUID_DUMP_USER);
arch_setup_new_exec();
perf_event_exec();
@@ -1351,15 +1386,6 @@ void setup_new_exec(struct linux_binprm * bprm)
*/
current->mm->task_size = TASK_SIZE;
- /* install the new credentials */
- if (!uid_eq(bprm->cred->uid, current_euid()) ||
- !gid_eq(bprm->cred->gid, current_egid())) {
- current->pdeath_signal = 0;
- } else {
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
- set_dumpable(current->mm, suid_dumpable);
- }
-
/* An exec changes our domain. We are no longer part of the thread
group */
current->self_exec_id++;
@@ -1548,7 +1574,7 @@ int prepare_binprm(struct linux_binprm *bprm)
retval = security_bprm_set_creds(bprm);
if (retval)
return retval;
- bprm->cred_prepared = 1;
+ bprm->called_set_creds = 1;
memset(bprm->buf, 0, BINPRM_BUF_SIZE);
return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 28645f0640f7..a94594ea2aa3 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -48,7 +48,7 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
struct inode *inode = filp->f_mapping->host;
int ret;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(filp, start, end);
if (ret)
return ret;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d34d32bdc944..ff3a3636a5ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
return ret;
}
-static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- loff_t size;
- int ret;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- down_read(&ei->dax_sem);
-
- /* check that the faulting page hasn't raced with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else
- ret = dax_pfn_mkwrite(vmf);
-
- up_read(&ei->dax_sem);
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
/*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
* will always fail and fail back to regular faults.
*/
.page_mkwrite = ext2_dax_fault,
- .pfn_mkwrite = ext2_dax_pfn_mkwrite,
+ .pfn_mkwrite = ext2_dax_fault,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index e8b365000d73..b04e882179c6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
{
struct dir_private_info *p;
- p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
p->curr_hash = pos2maj_hash(filp, pos);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a2bb7d2870e4..84b9da192238 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
{
if (unlikely(sizeof(time->tv_sec) > 4 &&
(extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
+
+#if 1
/* Handle legacy encoding of pre-1970 dates with epoch
- * bits 1,1. We assume that by kernel version 4.20,
- * everyone will have run fsck over the affected
- * filesystems to correct the problem. (This
- * backwards compatibility may be removed before this
- * time, at the discretion of the ext4 developers.)
+ * bits 1,1. (This backwards compatibility may be removed
+ * at the discretion of the ext4 developers.)
*/
u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
@@ -1567,6 +1565,7 @@ enum {
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
+ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0d7cf0cc9b87..57dcaea762c3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ /*
+ * We have to distinguish real writes from writes which will result in a
+ * COW page; COW writes should *not* poke the journal (the file will not
+ * be changed). Doing so would cause unintended failures when mounted
+ * read-only.
+ *
+ * We check for VM_SHARED rather than vmf->cow_page since the latter is
+ * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * other sizes, dax_iomap_fault will handle splitting / fallback so that
+ * we eventually come back with a COW page.
+ */
+ bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
if (write) {
sb_start_pagefault(sb);
@@ -311,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- loff_t size;
- int ret;
-
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else
- ret = dax_pfn_mkwrite(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- sb_end_pagefault(sb);
-
- return ret;
-}
-
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
- .pfn_mkwrite = ext4_dax_pfn_mkwrite,
+ .pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
@@ -494,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
pagevec_init(&pvec, 0);
do {
- int i, num;
+ int i;
unsigned long nr_pages;
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- (pgoff_t)num);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ &index, end);
if (nr_pages == 0)
break;
@@ -518,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
goto out;
}
- if (page->index > end)
- goto out;
-
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping)) {
@@ -563,14 +542,10 @@ next:
unlock_page(page);
}
- /* The no. of pages is less than our desired, we are done. */
- if (nr_pages < num)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
+ /* There are no pages upto endoff - that would be a hole in there. */
if (whence == SEEK_HOLE && lastoff < endoff) {
found = 1;
*offset = lastoff;
@@ -595,7 +570,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
inode_lock(inode);
isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset < 0 || offset >= isize) {
inode_unlock(inode);
return -ENXIO;
}
@@ -658,7 +633,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
inode_lock(inode);
isize = i_size_read(inode);
- if (offset >= isize) {
+ if (offset < 0 || offset >= isize) {
inode_unlock(inode);
return -ENXIO;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 38b8a96eb97c..00c6dd29e621 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
if (len > num*4)
len = num * 4;
for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
val = ((int) scp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
@@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
if (len > num*4)
len = num * 4;
for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 507bfb3344d4..71e93a23cec3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -692,24 +692,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* somewhat arbitrary...)
*/
#define RECENTCY_MIN 5
-#define RECENTCY_DIRTY 30
+#define RECENTCY_DIRTY 300
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
struct ext4_group_desc *gdp;
struct ext4_inode *raw_inode;
struct buffer_head *bh;
- unsigned long dtime, now;
- int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
- int offset, ret = 0, recentcy = RECENTCY_MIN;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0;
+ int recentcy = RECENTCY_MIN;
+ u32 dtime, now;
gdp = ext4_get_group_desc(sb, group, NULL);
if (unlikely(!gdp))
return 0;
- bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
(ino / inodes_per_block));
- if (unlikely(!bh) || !buffer_uptodate(bh))
+ if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
* must have been written out.
@@ -718,18 +719,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+
+ /* i_dtime is only 32 bits on disk, but we only care about relative
+ * times in the range of a few minutes (i.e. long enough to sync a
+ * recently-deleted inode to disk), so using the low 32 bits of the
+ * clock (a 68 year range) is enough, see time_before32() */
dtime = le32_to_cpu(raw_inode->i_dtime);
- now = get_seconds();
+ now = ktime_get_real_seconds();
if (buffer_dirty(bh))
recentcy += RECENTCY_DIRTY;
- if (dtime && (dtime < now) && (now < dtime + recentcy))
+ if (dtime && time_before32(dtime, now) &&
+ time_before32(now, dtime + recentcy))
ret = 1;
out:
brelse(bh);
return ret;
}
+static int find_inode_bit(struct super_block *sb, ext4_group_t group,
+ struct buffer_head *bitmap, unsigned long *ino)
+{
+next:
+ *ino = ext4_find_next_zero_bit((unsigned long *)
+ bitmap->b_data,
+ EXT4_INODES_PER_GROUP(sb), *ino);
+ if (*ino >= EXT4_INODES_PER_GROUP(sb))
+ return 0;
+
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, *ino)) {
+ *ino = *ino + 1;
+ if (*ino < EXT4_INODES_PER_GROUP(sb))
+ goto next;
+ return 0;
+ }
+
+ return 1;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -892,19 +920,13 @@ got_group:
/*
* Check free inodes count before loading bitmap.
*/
- if (ext4_free_inodes_count(sb, gdp) == 0) {
- if (++group == ngroups)
- group = 0;
- continue;
- }
+ if (ext4_free_inodes_count(sb, gdp) == 0)
+ goto next_group;
grp = ext4_get_group_info(sb, group);
/* Skip groups with already-known suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- if (++group == ngroups)
- group = 0;
- continue;
- }
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
@@ -912,27 +934,20 @@ got_group:
if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
- if (++group == ngroups)
- group = 0;
- continue;
+ goto next_group;
}
repeat_in_this_group:
- ino = ext4_find_next_zero_bit((unsigned long *)
- inode_bitmap_bh->b_data,
- EXT4_INODES_PER_GROUP(sb), ino);
- if (ino >= EXT4_INODES_PER_GROUP(sb))
+ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+ if (!ret2)
goto next_group;
- if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+
+ if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
- continue;
- }
- if ((EXT4_SB(sb)->s_journal == NULL) &&
- recently_deleted(sb, group, ino)) {
- ino++;
- goto next_inode;
+ goto next_group;
}
+
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -952,11 +967,23 @@ repeat_in_this_group:
}
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+ if (ret2) {
+ /* Someone already took the bit. Repeat the search
+ * with lock held.
+ */
+ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+ if (ret2) {
+ ext4_set_bit(ino, inode_bitmap_bh->b_data);
+ ret2 = 0;
+ } else {
+ ret2 = 1; /* we didn't grab the inode */
+ }
+ }
ext4_unlock_group(sb, group);
ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
-next_inode:
+
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
next_group:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c774bdc22759..e963508ea35f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
pagevec_init(&pvec, 0);
while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
- break;
+
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (invalidate) {
@@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
}
unlock_page(page);
}
- index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
}
@@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
pagevec_init(&pvec, 0);
while (start <= end) {
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
- PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ &start, end);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
- break;
- /* Up to 'end' pages must be contiguous */
- BUG_ON(page->index != start);
bh = head = page_buffers(page);
do {
if (lblk < mpd->map.m_lblk)
@@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
pagevec_release(&pvec);
return err;
}
- start++;
}
pagevec_release(&pvec);
}
@@ -4897,14 +4890,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
brelse(iloc.bh);
ext4_set_inode_flags(inode);
- if (ei->i_flags & EXT4_EA_INODE_FL) {
- ext4_xattr_inode_set_class(inode);
-
- inode_lock(inode);
- inode->i_flags |= S_NOQUOTA;
- inode_unlock(inode);
- }
-
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index eb9835638680..77cdce1f17ce 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -367,7 +367,7 @@ skip:
goto failed;
}
- mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+ mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
if (!mmpd_data) {
ext4_warning(sb, "not enough memory for mmpd_data");
goto failed;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d61a70e2193a..93aece6891f2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2404,6 +2404,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
unsigned int s_flags = sb->s_flags;
int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
+ int quota_update = 0;
int i;
#endif
if (!es->s_last_orphan) {
@@ -2442,14 +2443,32 @@ static void ext4_orphan_cleanup(struct super_block *sb,
#ifdef CONFIG_QUOTA
/* Needed for iput() to work correctly and not trash data */
sb->s_flags |= MS_ACTIVE;
- /* Turn on quotas so that they are updated correctly */
+
+ /*
+ * Turn on quotas which were not enabled for read-only mounts if
+ * filesystem has quota feature, so that they are updated correctly.
+ */
+ if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) {
+ int ret = ext4_enable_quotas(sb);
+
+ if (!ret)
+ quota_update = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on quotas: error %d", ret);
+ }
+
+ /* Turn on journaled quotas used for old sytle */
for (i = 0; i < EXT4_MAXQUOTAS; i++) {
if (EXT4_SB(sb)->s_qf_names[i]) {
int ret = ext4_quota_on_mount(sb, i);
- if (ret < 0)
+
+ if (!ret)
+ quota_update = 1;
+ else
ext4_msg(sb, KERN_ERR,
"Cannot turn on journaled "
- "quota: error %d", ret);
+ "quota: type %d: error %d", i, ret);
}
}
#endif
@@ -2510,10 +2529,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
- /* Turn quotas off */
- for (i = 0; i < EXT4_MAXQUOTAS; i++) {
- if (sb_dqopt(sb)->files[i])
- dquot_quota_off(sb, i);
+ /* Turn off quotas if they were enabled for orphan cleanup */
+ if (quota_update) {
+ for (i = 0; i < EXT4_MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
}
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -5194,7 +5215,7 @@ static int ext4_statfs_project(struct super_block *sb,
dquot = dqget(sb, qid);
if (IS_ERR(dquot))
return PTR_ERR(dquot);
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
limit = (dquot->dq_dqb.dqb_bsoftlimit ?
dquot->dq_dqb.dqb_bsoftlimit :
@@ -5217,7 +5238,7 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
}
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
dqput(dquot);
return 0;
}
@@ -5263,18 +5284,13 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-/* Helper function for writing quotas on sync - we need to start transaction
- * before quota file is locked for write. Otherwise the are possible deadlocks:
- * Process 1 Process 2
- * ext4_create() quota_sync()
- * jbd2_journal_start() write_dquot()
- * dquot_initialize() down(dqio_mutex)
- * down(dqio_mutex) jbd2_journal_start()
- *
- */
#ifdef CONFIG_QUOTA
+/*
+ * Helper functions so that transaction is started before we acquire dqio_sem
+ * to keep correct lock ordering of transaction > dqio_sem
+ */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
@@ -5409,6 +5425,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
ext4_msg(sb, KERN_WARNING,
"Quota file not on filesystem root. "
"Journaled quota will not work");
+ sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
+ } else {
+ /*
+ * Clear the flag just in case mount options changed since
+ * last time.
+ */
+ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
}
/*
@@ -5505,13 +5528,16 @@ static int ext4_enable_quotas(struct super_block *sb)
test_opt(sb, PRJQUOTA),
};
- sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
for (type = 0; type < EXT4_MAXQUOTAS; type++) {
if (qf_inums[type]) {
err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
DQUOT_USAGE_ENABLED |
(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
if (err) {
+ for (type--; type >= 0; type--)
+ dquot_quota_off(sb, type);
+
ext4_warning(sb,
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3dd970168448..3b69330a4250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -354,8 +354,10 @@ free_bhs:
return ret;
}
+#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
+
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
- struct inode **ea_inode)
+ u32 ea_inode_hash, struct inode **ea_inode)
{
struct inode *inode;
int err;
@@ -385,6 +387,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
goto error;
}
+ ext4_xattr_inode_set_class(inode);
+
+ /*
+ * Check whether this is an old Lustre-style xattr inode. Lustre
+ * implementation does not have hash validation, rather it has a
+ * backpointer from ea_inode to the parent inode.
+ */
+ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) &&
+ EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino &&
+ inode->i_generation == parent->i_generation) {
+ ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
+ ext4_xattr_inode_set_ref(inode, 1);
+ } else {
+ inode_lock(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode_unlock(inode);
+ }
+
*ea_inode = inode;
return 0;
error:
@@ -417,8 +437,6 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
return 0;
}
-#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
-
/*
* Read xattr value from the EA inode.
*/
@@ -431,7 +449,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
int err;
err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum),
- &ea_inode);
+ le32_to_cpu(entry->e_hash), &ea_inode);
if (err) {
ea_inode = NULL;
goto out;
@@ -449,29 +467,20 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
if (err)
goto out;
- err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size);
- /*
- * Compatibility check for old Lustre ea_inode implementation. Old
- * version does not have hash validation, but it has a backpointer
- * from ea_inode to the parent inode.
- */
- if (err == -EFSCORRUPTED) {
- if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
- ea_inode->i_generation != inode->i_generation) {
+ if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) {
+ err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer,
+ size);
+ if (err) {
ext4_warning_inode(ea_inode,
"EA inode hash validation failed");
goto out;
}
- /* Do not add ea_inode to the cache. */
- ea_inode_cache = NULL;
- err = 0;
- } else if (err)
- goto out;
- if (ea_inode_cache)
- mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
- ext4_xattr_inode_get_hash(ea_inode),
- ea_inode->i_ino, true /* reusable */);
+ if (ea_inode_cache)
+ mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
+ ext4_xattr_inode_get_hash(ea_inode),
+ ea_inode->i_ino, true /* reusable */);
+ }
out:
iput(ea_inode);
return err;
@@ -838,10 +847,15 @@ static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
return err;
}
-static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
+static void ext4_xattr_inode_free_quota(struct inode *parent,
+ struct inode *ea_inode,
+ size_t len)
{
- dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
- dquot_free_inode(inode);
+ if (ea_inode &&
+ ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE))
+ return;
+ dquot_free_space_nodirty(parent, round_up_cluster(parent, len));
+ dquot_free_inode(parent);
}
int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
@@ -1071,7 +1085,9 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
if (!entry->e_value_inum)
continue;
ea_ino = le32_to_cpu(entry->e_value_inum);
- err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ err = ext4_xattr_inode_iget(parent, ea_ino,
+ le32_to_cpu(entry->e_hash),
+ &ea_inode);
if (err)
goto cleanup;
err = ext4_xattr_inode_inc_ref(handle, ea_inode);
@@ -1093,7 +1109,9 @@ cleanup:
if (!entry->e_value_inum)
continue;
ea_ino = le32_to_cpu(entry->e_value_inum);
- err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ err = ext4_xattr_inode_iget(parent, ea_ino,
+ le32_to_cpu(entry->e_hash),
+ &ea_inode);
if (err) {
ext4_warning(parent->i_sb,
"cleanup ea_ino %u iget error %d", ea_ino,
@@ -1131,7 +1149,9 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
if (!entry->e_value_inum)
continue;
ea_ino = le32_to_cpu(entry->e_value_inum);
- err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
+ err = ext4_xattr_inode_iget(parent, ea_ino,
+ le32_to_cpu(entry->e_hash),
+ &ea_inode);
if (err)
continue;
@@ -1159,7 +1179,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
}
if (!skip_quota)
- ext4_xattr_inode_free_quota(parent,
+ ext4_xattr_inode_free_quota(parent, ea_inode,
le32_to_cpu(entry->e_value_size));
/*
@@ -1591,6 +1611,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
if (!s->not_found && here->e_value_inum) {
ret = ext4_xattr_inode_iget(inode,
le32_to_cpu(here->e_value_inum),
+ le32_to_cpu(here->e_hash),
&old_ea_inode);
if (ret) {
old_ea_inode = NULL;
@@ -1609,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
&new_ea_inode);
if (ret) {
new_ea_inode = NULL;
- ext4_xattr_inode_free_quota(inode, i->value_len);
+ ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
goto out;
}
}
@@ -1628,13 +1649,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
ext4_warning_inode(new_ea_inode,
"dec ref new_ea_inode err=%d",
err);
- ext4_xattr_inode_free_quota(inode,
+ ext4_xattr_inode_free_quota(inode, new_ea_inode,
i->value_len);
}
goto out;
}
- ext4_xattr_inode_free_quota(inode,
+ ext4_xattr_inode_free_quota(inode, old_ea_inode,
le32_to_cpu(here->e_value_size));
}
@@ -1805,8 +1826,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
struct mb_cache_entry *ce = NULL;
int error = 0;
struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
- struct inode *ea_inode = NULL;
- size_t old_ea_inode_size = 0;
+ struct inode *ea_inode = NULL, *tmp_inode;
+ size_t old_ea_inode_quota = 0;
+ unsigned int ea_ino;
+
#define header(x) ((struct ext4_xattr_header *)(x))
@@ -1865,12 +1888,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* like it has an empty value.
*/
if (!s->not_found && s->here->e_value_inum) {
- /*
- * Defer quota free call for previous inode
- * until success is guaranteed.
- */
- old_ea_inode_size = le32_to_cpu(
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
+ goto cleanup;
+
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
s->here->e_value_size);
+ }
+ iput(tmp_inode);
+
s->here->e_value_inum = 0;
s->here->e_value_size = 0;
}
@@ -1897,8 +1932,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
goto cleanup;
if (i->value && s->here->e_value_inum) {
- unsigned int ea_ino;
-
/*
* A ref count on ea_inode has been taken as part of the call to
* ext4_xattr_set_entry() above. We would like to drop this
@@ -1906,7 +1939,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* initialized and has its own ref count on the ea_inode.
*/
ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &ea_inode);
if (error) {
ea_inode = NULL;
goto cleanup;
@@ -2056,8 +2091,8 @@ getblk_failed:
}
}
- if (old_ea_inode_size)
- ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
+ if (old_ea_inode_quota)
+ ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota);
/* Update the inode. */
EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
@@ -2084,7 +2119,7 @@ cleanup:
/* If there was an error, revert the quota charge. */
if (error)
- ext4_xattr_inode_free_quota(inode,
+ ext4_xattr_inode_free_quota(inode, ea_inode,
i_size_read(ea_inode));
iput(ea_inode);
}
@@ -2800,6 +2835,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct ext4_xattr_ibody_header *header;
struct ext4_iloc iloc = { .bh = NULL };
struct ext4_xattr_entry *entry;
+ struct inode *ea_inode;
int error;
error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
@@ -2854,10 +2890,19 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
if (ext4_has_feature_ea_inode(inode->i_sb)) {
for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
- entry = EXT4_XATTR_NEXT(entry))
- if (entry->e_value_inum)
- ext4_xattr_inode_free_quota(inode,
+ entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_inum)
+ continue;
+ error = ext4_xattr_inode_iget(inode,
+ le32_to_cpu(entry->e_value_inum),
+ le32_to_cpu(entry->e_hash),
+ &ea_inode);
+ if (error)
+ continue;
+ ext4_xattr_inode_free_quota(inode, ea_inode,
le32_to_cpu(entry->e_value_size));
+ iput(ea_inode);
+ }
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a791aac4c5af..fb96bb71da00 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2253,7 +2253,10 @@ int f2fs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
set_page_private(newpage, page_private(page));
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 2706130c261b..843a0d99f7ea 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -206,7 +206,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
/* if fdatasync is triggered, let's do in-place-update */
if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(inode, FI_NEED_IPU);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
clear_inode_flag(inode, FI_NEED_IPU);
if (ret) {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6a7152d0c250..02c066663a3a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -19,6 +19,8 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/namei.h>
+#include <linux/kernel.h>
+
#include "fat.h"
static inline unsigned long vfat_d_version(struct dentry *dentry)
@@ -510,10 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
struct nls_table *nls)
{
const unsigned char *ip;
- unsigned char nc;
unsigned char *op;
- unsigned int ec;
- int i, k, fill;
+ int i, fill;
int charlen;
if (utf8) {
@@ -530,33 +530,22 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
i < len && *outlen < FAT_LFN_LEN;
*outlen += 1) {
if (escape && (*ip == ':')) {
+ u8 uc[2];
+
if (i > len - 5)
return -EINVAL;
- ec = 0;
- for (k = 1; k < 5; k++) {
- nc = ip[k];
- ec <<= 4;
- if (nc >= '0' && nc <= '9') {
- ec |= nc - '0';
- continue;
- }
- if (nc >= 'a' && nc <= 'f') {
- ec |= nc - ('a' - 10);
- continue;
- }
- if (nc >= 'A' && nc <= 'F') {
- ec |= nc - ('A' - 10);
- continue;
- }
+
+ if (hex2bin(uc, ip + 1, 2) < 0)
return -EINVAL;
- }
- *op++ = ec & 0xFF;
- *op++ = ec >> 8;
+
+ *(wchar_t *)op = uc[0] << 8 | uc[1];
+
+ op += 2;
ip += 5;
i += 5;
} else {
charlen = nls->char2uni(ip, len - i,
- (wchar_t *)op);
+ (wchar_t *)op);
if (charlen < 0)
return -EINVAL;
ip += charlen;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c8c4f79c7ce1..0ad3fd3ad0b4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
pagevec_init(&pvec, 0);
next = 0;
do {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ if (!pagevec_lookup(&pvec, mapping, &next))
break;
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- next = page->index;
if (PageFsCache(page)) {
__fscache_wait_on_page_write(cookie, page);
__fscache_uncache_page(cookie, page);
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
}
pagevec_release(&pvec);
cond_resched();
- } while (++next);
+ } while (next);
_leave("");
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ab60051be6e5..d66789804287 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -457,7 +457,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
* wait for all outstanding writes, before sending the FSYNC
* request.
*/
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(file, start, end);
if (err)
goto out;
@@ -465,10 +465,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
/*
* Due to implementation of fuse writeback
- * filemap_write_and_wait_range() does not catch errors.
+ * file_write_and_wait_range() does not catch errors.
* We have to do this directly after fuse_sync_writes()
*/
- err = filemap_check_errors(file->f_mapping);
+ err = file_check_and_advance_wb_err(file);
if (err)
goto out;
@@ -2102,11 +2102,11 @@ static int convert_fuse_file_lock(struct fuse_conn *fc,
fl->fl_end = ffl->end;
/*
- * Convert pid into the caller's pid namespace. If the pid
- * does not map into the namespace fl_pid will get set to 0.
+ * Convert pid into init's pid namespace. The locks API will
+ * translate it into the caller's pid namespace.
*/
rcu_read_lock();
- fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns));
+ fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
rcu_read_unlock();
break;
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 2524807ee070..9d5eecb123de 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
char *data;
const char *name = gfs2_acl_name(type);
- if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
- return -E2BIG;
-
- if (type == ACL_TYPE_ACCESS) {
- umode_t mode = inode->i_mode;
-
- error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
- if (error)
- return error;
- if (mode != inode->i_mode)
- mark_inode_dirty(inode);
- }
-
if (acl) {
len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
if (len == 0)
@@ -129,6 +116,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct gfs2_holder gh;
bool need_unlock = false;
int ret;
+ umode_t mode;
+
+ if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+ return -E2BIG;
ret = gfs2_rsqa_alloc(ip);
if (ret)
@@ -140,7 +131,20 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return ret;
need_unlock = true;
}
+
+ mode = inode->i_mode;
+ if (type == ACL_TYPE_ACCESS && acl) {
+ ret = posix_acl_update_mode(inode, &mode, &acl);
+ if (ret)
+ goto unlock;
+ }
+
ret = __gfs2_set_acl(inode, acl, type);
+ if (!ret && mode != inode->i_mode) {
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
+ }
+unlock:
if (need_unlock)
gfs2_glock_dq_uninit(&gh);
return ret;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ed7a2e252ad8..68ed06962537 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -234,7 +234,19 @@ out:
static int gfs2_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+ struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
+ int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
+
+ /*
+ * Even if we didn't write any pages here, we might still be holding
+ * dirty pages in the ail. We forcibly flush the ail because we don't
+ * want balance_dirty_pages() to loop indefinitely trying to write out
+ * pages held in the ail that it can't find.
+ */
+ if (ret == 0)
+ set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
+
+ return ret;
}
/**
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 9fa3aef9a5b3..3dd0cceefa43 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
if (trylock_buffer(rabh)) {
if (!buffer_uptodate(rabh)) {
rabh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
- rabh);
+ submit_bh(REQ_OP_READ,
+ REQ_RAHEAD | REQ_META | REQ_PRIO,
+ rabh);
continue;
}
unlock_buffer(rabh);
@@ -1103,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
while (true) {
ptr = metapointer(h, mp);
- if (*ptr) /* if we have a non-null pointer */
+ if (*ptr) { /* if we have a non-null pointer */
+ /* Now zero the metapath after the current height. */
+ h++;
+ if (h < GFS2_MAX_META_HEIGHT)
+ memset(&mp->mp_list[h], 0,
+ (GFS2_MAX_META_HEIGHT - h) *
+ sizeof(mp->mp_list[0]));
return true;
+ }
if (mp->mp_list[h] < ptrs)
mp->mp_list[h]++;
@@ -1120,6 +1128,13 @@ enum dealloc_states {
DEALLOC_DONE = 3, /* process complete */
};
+static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h)
+{
+ if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0])))
+ return false;
+ return true;
+}
+
/**
* trunc_dealloc - truncate a file down to a desired size
* @ip: inode to truncate
@@ -1197,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
/* If we're truncating to a non-zero size and the mp is
at the beginning of file for the strip height, we
need to preserve the first metadata pointer. */
- preserve1 = (newsize &&
- (mp.mp_list[mp_h] == nbof[mp_h]));
+ preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h));
bh = mp.mp_bh[mp_h];
gfs2_assert_withdraw(sdp, bh);
if (gfs2_assert_withdraw(sdp,
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5ee2e2f8576c..06a0d1947c77 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1513,7 +1513,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
continue;
}
bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh);
+ submit_bh(REQ_OP_READ,
+ REQ_RAHEAD | REQ_META | REQ_PRIO,
+ bh);
continue;
}
brelse(bh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c2062a108d19..33a0cb5701a3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -668,12 +668,14 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
if (ret)
return ret;
if (gfs2_is_jdata(ip))
- filemap_write_and_wait(mapping);
+ ret = file_write_and_wait(file);
+ if (ret)
+ return ret;
gfs2_ail_flush(ip->i_gl, 1);
}
if (mapping->nrpages)
- ret = filemap_fdatawait_range(mapping, start, end);
+ ret = file_fdatawait_range(file, start, end);
return ret ? ret : ret1;
}
@@ -1030,8 +1032,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->f_fl_mutex);
- gl = fl_gh->gh_gl;
- if (gl) {
+ if (gfs2_holder_initialized(fl_gh)) {
if (fl_gh->gh_state == state)
goto out;
locks_lock_file_wait(file,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c38ab6c81898..98e845b7841b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/delay.h>
#include <linux/sort.h>
+#include <linux/hash.h>
#include <linux/jhash.h>
#include <linux/kallsyms.h>
#include <linux/gfs2_ondisk.h>
@@ -71,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT)
-static struct rhashtable_params ht_parms = {
+static const struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
.key_len = offsetofend(struct lm_lockname, ln_type),
.key_offset = offsetof(struct gfs2_glock, gl_name),
@@ -80,6 +81,49 @@ static struct rhashtable_params ht_parms = {
static struct rhashtable gl_hash_table;
+#define GLOCK_WAIT_TABLE_BITS 12
+#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS)
+static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+struct wait_glock_queue {
+ struct lm_lockname *name;
+ wait_queue_entry_t wait;
+};
+
+static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
+ int sync, void *key)
+{
+ struct wait_glock_queue *wait_glock =
+ container_of(wait, struct wait_glock_queue, wait);
+ struct lm_lockname *wait_name = wait_glock->name;
+ struct lm_lockname *wake_name = key;
+
+ if (wake_name->ln_sbd != wait_name->ln_sbd ||
+ wake_name->ln_number != wait_name->ln_number ||
+ wake_name->ln_type != wait_name->ln_type)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
+{
+ u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+
+ return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
+}
+
+/**
+ * wake_up_glock - Wake up waiters on a glock
+ * @gl: the glock
+ */
+static void wake_up_glock(struct gfs2_glock *gl)
+{
+ wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name);
+
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name);
+}
+
static void gfs2_glock_dealloc(struct rcu_head *rcu)
{
struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
@@ -96,6 +140,9 @@ void gfs2_glock_free(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
+ smp_mb();
+ wake_up_glock(gl);
call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
@@ -107,7 +154,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
*
*/
-static void gfs2_glock_hold(struct gfs2_glock *gl)
+void gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
lockref_get(&gl->gl_lockref);
@@ -150,6 +197,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
+ if (!(gl->gl_ops->go_flags & GLOF_LRU))
+ return;
+
spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
@@ -191,13 +241,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
- rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
+/*
+ * Cause the glock to be put in work queue context.
+ */
+void gfs2_glock_queue_put(struct gfs2_glock *gl)
+{
+ gfs2_glock_queue_work(gl, 0);
+}
+
/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
@@ -676,6 +733,40 @@ static void glock_work_func(struct work_struct *work)
spin_unlock(&gl->gl_lockref.lock);
}
+static struct gfs2_glock *find_insert_glock(struct lm_lockname *name,
+ struct gfs2_glock *new)
+{
+ struct wait_glock_queue wait;
+ wait_queue_head_t *wq = glock_waitqueue(name);
+ struct gfs2_glock *gl;
+
+ wait.name = name;
+ init_wait(&wait.wait);
+ wait.wait.func = glock_wake_function;
+
+again:
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ rcu_read_lock();
+ if (new) {
+ gl = rhashtable_lookup_get_insert_fast(&gl_hash_table,
+ &new->gl_node, ht_parms);
+ if (IS_ERR(gl))
+ goto out;
+ } else {
+ gl = rhashtable_lookup_fast(&gl_hash_table,
+ name, ht_parms);
+ }
+ if (gl && !lockref_get_not_dead(&gl->gl_lockref)) {
+ rcu_read_unlock();
+ schedule();
+ goto again;
+ }
+out:
+ rcu_read_unlock();
+ finish_wait(wq, &wait.wait);
+ return gl;
+}
+
/**
* gfs2_glock_get() - Get a glock, or create one if one doesn't exist
* @sdp: The GFS2 superblock
@@ -702,15 +793,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct kmem_cache *cachep;
int ret = 0;
- rcu_read_lock();
- gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
- if (gl && !lockref_get_not_dead(&gl->gl_lockref))
- gl = NULL;
- rcu_read_unlock();
-
- *glp = gl;
- if (gl)
+ gl = find_insert_glock(&name, NULL);
+ if (gl) {
+ *glp = gl;
return 0;
+ }
if (!create)
return -ENOENT;
@@ -764,10 +851,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->writeback_index = 0;
}
-again:
- rcu_read_lock();
- tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
- ht_parms);
+ tmp = find_insert_glock(&name, gl);
if (!tmp) {
*glp = gl;
goto out;
@@ -776,13 +860,7 @@ again:
ret = PTR_ERR(tmp);
goto out_free;
}
- if (lockref_get_not_dead(&tmp->gl_lockref)) {
- *glp = tmp;
- goto out_free;
- }
- rcu_read_unlock();
- cond_resched();
- goto again;
+ *glp = tmp;
out_free:
kfree(gl->gl_lksb.sb_lvbptr);
@@ -790,7 +868,6 @@ out_free:
atomic_dec(&sdp->sd_glock_disposal);
out:
- rcu_read_unlock();
return ret;
}
@@ -1473,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
do {
gl = ERR_PTR(rhashtable_walk_start(&iter));
- if (gl)
- continue;
+ if (IS_ERR(gl))
+ goto walk_stop;
while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
- if ((gl->gl_name.ln_sbd == sdp) &&
+ if (gl->gl_name.ln_sbd == sdp &&
lockref_get_not_dead(&gl->gl_lockref))
examiner(gl);
+walk_stop:
rhashtable_walk_stop(&iter);
} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
@@ -1803,7 +1881,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
int __init gfs2_glock_init(void)
{
- int ret;
+ int i, ret;
ret = rhashtable_init(&gl_hash_table, &ht_parms);
if (ret < 0)
@@ -1832,6 +1910,9 @@ int __init gfs2_glock_init(void)
return ret;
}
+ for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(glock_wait_table + i);
+
return 0;
}
@@ -1860,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
@@ -1892,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
}
static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
+ __releases(RCU)
{
struct gfs2_glock_iter *gi = seq->private;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 9ad4a6ac6c84..5e12220cc0c2 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/parser.h>
#include "incore.h"
+#include "util.h"
/* Options for hostdata parser */
@@ -181,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
+extern void gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
+extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
u16 flags, struct gfs2_holder *gh);
extern void gfs2_holder_reinit(unsigned int state, u16 flags,
@@ -257,11 +260,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh)
return gh->gh_gl;
}
+/**
+ * glock_set_object - set the gl_object field of a glock
+ * @gl: the glock
+ * @object: the object
+ */
static inline void glock_set_object(struct gfs2_glock *gl, void *object)
{
spin_lock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL))
+ gfs2_dump_glock(NULL, gl);
gl->gl_object = object;
spin_unlock(&gl->gl_lockref.lock);
}
+/**
+ * glock_clear_object - clear the gl_object field of a glock
+ * @gl: the glock
+ * @object: the object
+ *
+ * I'd love to similarly add this:
+ * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object))
+ * gfs2_dump_glock(NULL, gl);
+ * Unfortunately, that's not possible because as soon as gfs2_delete_inode
+ * frees the block in the rgrp, another process can reassign it for an I_NEW
+ * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget.
+ * That means gfs2_delete_inode may subsequently try to call this function
+ * for a glock that's already pointing to a brand new inode. If we clear the
+ * new inode's gl_object, we'll introduce metadata corruption. Function
+ * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also
+ * tries to clear gl_object, so it's more than just gfs2_delete_inode.
+ *
+ */
+static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
+{
+ spin_lock(&gl->gl_lockref.lock);
+ if (gl->gl_object == object)
+ gl->gl_object = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+}
+
#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 5e69636d4dd3..dac6559e2195 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
return 1;
}
-/**
- * gfs2_set_nlink - Set the inode's link count based on on-disk info
- * @inode: The inode in question
- * @nlink: The link count
- *
- * If the link count has hit zero, it must never be raised, whatever the
- * on-disk inode might say. When new struct inodes are created the link
- * count is set to 1, so that we can safely use this test even when reading
- * in on disk information for the first time.
- */
-
-static void gfs2_set_nlink(struct inode *inode, u32 nlink)
-{
- /*
- * We will need to review setting the nlink count here in the
- * light of the forthcoming ro bind mount work. This is a reminder
- * to do that.
- */
- if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
- if (nlink == 0)
- clear_nlink(inode);
- else
- set_nlink(inode, nlink);
- }
-}
-
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
const struct gfs2_dinode *str = buf;
@@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
- gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
+ set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
@@ -470,7 +444,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
(gh->gh_state == LM_ST_EXCLUSIVE)) {
spin_lock(&sdp->sd_trunc_lock);
if (list_empty(&ip->i_trunc_list))
- list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+ list_add(&ip->i_trunc_list, &sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
wake_up(&sdp->sd_quota_wait);
return 1;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 73fce76e67ee..6e18e9793ec4 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -606,6 +606,7 @@ enum {
SDF_NOJOURNALID = 6,
SDF_RORECOVERY = 7, /* read only recovery */
SDF_SKIP_DLM_UNLOCK = 8,
+ SDF_FORCE_AIL_FLUSH = 9,
};
enum gfs2_freeze_state {
@@ -816,6 +817,7 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
+ int sd_log_error;
atomic_t sd_reserving_log;
wait_queue_head_t sd_reserving_log_wait;
@@ -831,7 +833,7 @@ struct gfs2_sbd {
atomic_t sd_freeze_state;
struct mutex sd_freeze_mutex;
- char sd_fsname[GFS2_FSNAME_LEN];
+ char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2];
char sd_table_name[GFS2_FSNAME_LEN];
char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index acca501f8110..863749e29bf9 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode)
* @no_addr: The inode number
* @no_formal_ino: The inode generation number
* @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED;
- * GFS2_BLKST_FREE do indicate not to verify)
+ * GFS2_BLKST_FREE to indicate not to verify)
*
* If @type is DT_UNKNOWN, the inode type is fetched from disk.
*
@@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail;
flush_delayed_work(&ip->i_gl->gl_work);
- glock_set_object(ip->i_gl, ip);
error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (unlikely(error))
@@ -170,11 +169,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
}
}
+ glock_set_object(ip->i_gl, ip);
set_bit(GIF_INVALID, &ip->i_flags);
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (unlikely(error))
goto fail_put;
- flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work);
glock_set_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_put(io_gl);
io_gl = NULL;
@@ -202,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
+ glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
+ glock_clear_object(ip->i_gl, ip);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
- glock_set_object(ip->i_gl, NULL);
fail:
iget_failed(inode);
return ERR_PTR(error);
@@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (error)
goto fail_free_inode;
-
+ flush_delayed_work(&ip->i_gl->gl_work);
glock_set_object(ip->i_gl, ip);
+
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
if (error)
goto fail_free_inode;
@@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
return error;
fail_gunlock3:
+ glock_clear_object(io_gl, ip);
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
gfs2_glock_put(io_gl);
fail_gunlock2:
if (io_gl)
clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
fail_free_inode:
- if (ip->i_gl)
+ if (ip->i_gl) {
+ glock_clear_object(ip->i_gl, ip);
gfs2_glock_put(ip->i_gl);
+ }
gfs2_rsqa_delete(ip, NULL);
fail_free_acls:
if (default_acl)
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0515f0a68637..65f33a0ac190 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -23,8 +23,6 @@
#include "sys.h"
#include "trace_gfs2.h"
-extern struct workqueue_struct *gfs2_control_wq;
-
/**
* gfs2_update_stats - Update time based stats
* @mv: Pointer to mean/variance structure to update
@@ -1059,6 +1057,7 @@ static void free_recover_size(struct lm_lockstruct *ls)
ls->ls_recover_submit = NULL;
ls->ls_recover_result = NULL;
ls->ls_recover_size = 0;
+ ls->ls_lvb_bits = NULL;
}
/* dlm calls before it does lock recovery */
@@ -1175,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
spin_unlock(&ls->ls_recover_spin);
}
-const struct dlm_lockspace_ops gdlm_lockspace_ops = {
+static const struct dlm_lockspace_ops gdlm_lockspace_ops = {
.recover_prep = gdlm_recover_prep,
.recover_slot = gdlm_recover_slot,
.recover_done = gdlm_recover_done,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9a624f694400..f72c44231406 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
{
unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+
+ if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags))
+ return 1;
+
return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >=
atomic_read(&sdp->sd_log_thresh2);
}
@@ -919,6 +923,15 @@ int gfs2_logd(void *data)
while (!kthread_should_stop()) {
+ /* Check for errors writing to the journal */
+ if (sdp->sd_log_error) {
+ gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: error %d: "
+ "withdrawing the file system to "
+ "prevent further damage.\n",
+ sdp->sd_fsname, sdp->sd_log_error);
+ }
+
did_flush = false;
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
gfs2_ail1_empty(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 720c19ada0f9..c8ff7b7954f0 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio)
struct page *page;
int i;
- if (bio->bi_status)
- fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
+ if (bio->bi_status) {
+ fs_err(sdp, "Error %d writing to journal, jid=%u\n",
+ bio->bi_status, sdp->sd_jdesc->jd_jid);
+ wake_up(&sdp->sd_logd_waitq);
+ }
bio_for_each_segment_all(bvec, bio, i) {
page = bvec->bv_page;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 39433a173baa..52de1036d9f9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
brelse(bh);
ret = -EIO;
+ } else {
+ *bhp = bh;
}
- *bhp = bh;
return ret;
}
@@ -452,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
dblock++;
extlen--;
@@ -461,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
- ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh);
+ ll_rw_block(REQ_OP_READ,
+ REQ_RAHEAD | REQ_META | REQ_PRIO,
+ 1, &bh);
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8155e16076e1..84593587691d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
return error;
}
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
error = gfs2_sys_fs_add(sdp);
/*
@@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
}
if (sdp->sd_args.ar_spectator)
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s",
sdp->sd_table_name);
else
- snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+ snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u",
sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
error = init_inodes(sdp, DO);
@@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb)
sdp->sd_root_dir = NULL;
sdp->sd_master_dir = NULL;
shrink_dcache_sb(sb);
- gfs2_delete_debugfs_file(sdp);
free_percpu(sdp->sd_lkstats);
kill_block_super(sb);
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c2ca9566b764..e647938432bd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
if (PageUptodate(page))
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh);
+ ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh))
goto unlock_out;
@@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
{
if (error == 0 || error == -EROFS)
return;
- if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+ sdp->sd_log_error = error;
+ wake_up(&sdp->sd_logd_waitq);
+ }
}
static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 836e38ba5d0a..95b2a57ded33 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -705,8 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- glock_set_object(gl, NULL);
- gfs2_glock_add_to_lru(gl);
+ glock_clear_object(gl, rgd);
gfs2_glock_put(gl);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index fdedec379b78..769841185ce5 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -924,6 +924,7 @@ restart:
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
gfs2_gl_hash_clear(sdp);
+ gfs2_delete_debugfs_file(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
@@ -943,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
struct gfs2_sbd *sdp = sb->s_fs_info;
gfs2_quota_sync(sb, -1);
- if (wait && sdp)
+ if (wait)
gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
- return 0;
+ return sdp->sd_log_error;
}
void gfs2_freeze_func(struct work_struct *work)
@@ -1295,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
* gfs2_drop_inode - Drop an inode (test for remote unlink)
* @inode: The inode to drop
*
- * If we've received a callback on an iopen lock then its because a
+ * If we've received a callback on an iopen lock then it's because a
* remote node tried to deallocate the inode but failed due to this node
* still having the inode open. Here we mark the link count zero
* since we know that it must have reached zero if the GLF_DEMOTE flag
@@ -1317,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode)
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
clear_nlink(inode);
}
+
+ /*
+ * When under memory pressure when an inode's link count has dropped to
+ * zero, defer deleting the inode to the delete workqueue. This avoids
+ * calling into DLM under memory pressure, which can deadlock.
+ */
+ if (!inode->i_nlink &&
+ unlikely(current->flags & PF_MEMALLOC) &&
+ gfs2_holder_initialized(&ip->i_iopen_gh)) {
+ struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+
+ gfs2_glock_hold(gl);
+ if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+ gfs2_glock_queue_put(gl);
+ return false;
+ }
+
return generic_drop_inode(inode);
}
@@ -1501,6 +1519,22 @@ out_qs:
}
/**
+ * gfs2_glock_put_eventually
+ * @gl: The glock to put
+ *
+ * When under memory pressure, trigger a deferred glock put to make sure we
+ * won't call into DLM and deadlock. Otherwise, put the glock directly.
+ */
+
+static void gfs2_glock_put_eventually(struct gfs2_glock *gl)
+{
+ if (current->flags & PF_MEMALLOC)
+ gfs2_glock_queue_put(gl);
+ else
+ gfs2_glock_put(gl);
+}
+
+/**
* gfs2_evict_inode - Remove an inode from cache
* @inode: The inode to evict
*
@@ -1544,9 +1578,14 @@ static void gfs2_evict_inode(struct inode *inode)
goto alloc_failed;
}
+ /* Deletes should never happen under memory pressure anymore. */
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+ goto out;
+
/* Must not read inode block until block type has been verified */
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
+ glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
@@ -1562,6 +1601,12 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_truncate;
}
+ /*
+ * The inode may have been recreated in the meantime.
+ */
+ if (inode->i_nlink)
+ goto out_truncate;
+
alloc_failed:
if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
@@ -1595,6 +1640,11 @@ alloc_failed:
goto out_unlock;
}
+ /* We're about to clear the bitmap for the dinode, but as soon as we
+ do, gfs2_create_inode can create another inode at the same block
+ location and try to set gl_object again. We clear gl_object here so
+ that subsequent inode creates don't see an old gl_object. */
+ glock_clear_object(ip->i_gl, ip);
error = gfs2_dinode_dealloc(ip);
goto out_unlock;
@@ -1623,14 +1673,17 @@ out_unlock:
gfs2_rs_deltree(&ip->i_res);
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
+ glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
gfs2_glock_dq(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
}
- if (gfs2_holder_initialized(&gh))
+ if (gfs2_holder_initialized(&gh)) {
+ glock_clear_object(ip->i_gl, ip);
gfs2_glock_dq_uninit(&gh);
+ }
if (error && error != GLR_TRYFAILED && error != -EROFS)
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
@@ -1640,15 +1693,19 @@ out:
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
- glock_set_object(ip->i_gl, NULL);
+ glock_clear_object(ip->i_gl, ip);
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
- gfs2_glock_put(ip->i_gl);
+ gfs2_glock_put_eventually(ip->i_gl);
ip->i_gl = NULL;
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
- glock_set_object(ip->i_iopen_gh.gh_gl, NULL);
+ struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+
+ glock_clear_object(gl, ip);
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_hold(gl);
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_put_eventually(gl);
}
}
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index c81295f407f6..3926f95a6eb7 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
extern struct kmem_cache *gfs2_qadata_cachep;
extern mempool_t *gfs2_page_pool;
+extern struct workqueue_struct *gfs2_control_wq;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
unsigned int *p)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 54179554c7d2..ea09e41dbb49 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -25,6 +25,7 @@
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
+#include "super.h"
#include "trans.h"
#include "util.h"
@@ -1209,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
if (namel > GFS2_EA_MAX_NAME_LEN)
return -ERANGE;
- if (value == NULL)
- return gfs2_xattr_remove(ip, type, name);
+ if (value == NULL) {
+ error = gfs2_xattr_remove(ip, type, name);
+ if (error == -ENODATA && !(flags & XATTR_REPLACE))
+ error = 0;
+ return error;
+ }
if (ea_check_size(sdp, namel, size))
return -ERANGE;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index bfbba799430f..2538b49cc349 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
struct super_block * sb;
int ret, err;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(filp, start, end);
if (ret)
return ret;
inode_lock(inode);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index e8638d528195..4f26b6877130 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -283,7 +283,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
int error = 0, error2;
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ error = file_write_and_wait_range(file, start, end);
if (error)
return error;
inode_lock(inode);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e61261a7417e..c148e7f4f451 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -374,7 +374,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
struct inode *inode = file->f_mapping->host;
int ret;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index b3be1b5a62e2..f26138425b16 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -24,7 +24,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
int ret;
- ret = filemap_write_and_wait_range(file->f_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
return sync_blockdev(inode->i_sb->s_bdev);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28d2753be094..59073e9f01a4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page)
}
static void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
@@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
const pgoff_t end = lend >> huge_page_shift(h);
struct vm_area_struct pseudo_vma;
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t next, index;
int i, freed = 0;
- long lookup_nr = PAGEVEC_SIZE;
bool truncate_op = (lend == LLONG_MAX);
memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
@@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
next = start;
while (next < end) {
/*
- * Don't grab more pages than the number left in the range.
- */
- if (end - next < lookup_nr)
- lookup_nr = end - next;
-
- /*
* When no more pages are found, we are done.
*/
- if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+ if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
break;
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
u32 hash;
- /*
- * The page (index) could be beyond end. This is
- * only possible in the punch hole case as end is
- * max page offset in the truncate case.
- */
- next = page->index;
- if (next >= end)
- break;
-
+ index = page->index;
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
- mapping, next, 0);
+ mapping, index, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
i_mmap_lock_write(mapping);
hugetlb_vmdelete_list(&mapping->i_mmap,
- next * pages_per_huge_page(h),
- (next + 1) * pages_per_huge_page(h));
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h));
i_mmap_unlock_write(mapping);
}
@@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
- next, next + 1, 1)))
+ index, index + 1, 1)))
hugetlb_fix_reserve_counts(inode);
}
unlock_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- ++next;
huge_pagevec_release(&pvec);
cond_resched();
}
@@ -514,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
@@ -539,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
inode_lock(inode);
i_mmap_lock_write(mapping);
- if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT);
@@ -846,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/inode.c b/fs/inode.c
index 50370599e371..210054157a49 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping)
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
- mapping->i_mmap = RB_ROOT;
+ mapping->i_mmap = RB_ROOT_CACHED;
}
EXPORT_SYMBOL(address_space_init_once);
@@ -637,6 +637,7 @@ again:
dispose_list(&dispose);
}
+EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock
diff --git a/fs/internal.h b/fs/internal.h
index 9676fe11c093..fedfe94d84ba 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
extern void inode_io_list_del(struct inode *inode);
extern long get_nr_dirty_inodes(void);
-extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);
/*
diff --git a/fs/iomap.c b/fs/iomap.c
index 64b0df2308bb..269b24a01f32 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -477,10 +477,10 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
set_page_dirty(page);
wait_for_stable_page(page);
- return 0;
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
- return ret;
+ return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 217a5e7815da..f1ed935322db 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -96,7 +96,7 @@ static int __init init_inodecache(void)
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
- if (isofs_inode_cachep == NULL)
+ if (!isofs_inode_cachep)
return -ENOMEM;
return 0;
}
@@ -678,7 +678,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
if (isonum_711(vdp->type) == ISO_VD_END)
break;
if (isonum_711(vdp->type) == ISO_VD_PRIMARY) {
- if (pri == NULL) {
+ if (!pri) {
pri = (struct iso_primary_descriptor *)vdp;
/* Save the buffer in case we need it ... */
pri_bh = bh;
@@ -742,7 +742,7 @@ root_found:
goto out_freebh;
}
- if (joliet_level && (pri == NULL || !opt.rock)) {
+ if (joliet_level && (!pri || !opt.rock)) {
/* This is the case of Joliet with the norock mount flag.
* A disc with both Joliet and Rock Ridge is handled later
*/
@@ -1298,7 +1298,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
unsigned long block;
int high_sierra = sbi->s_high_sierra;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
struct iso_directory_record *de;
struct iso_directory_record *tmpde = NULL;
unsigned int de_len;
@@ -1320,8 +1320,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
int frag1 = bufsize - offset;
tmpde = kmalloc(de_len, GFP_KERNEL);
- if (tmpde == NULL) {
- printk(KERN_INFO "%s: out of memory\n", __func__);
+ if (!tmpde) {
ret = -ENOMEM;
goto fail;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c12476e309c6..bd0428bebe9b 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -35,7 +35,7 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
int ret;
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(filp, start, end);
if (ret)
return ret;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 739492c7a3fd..36665fd37095 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -34,7 +34,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
int rc = 0;
- rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ rc = file_write_and_wait_range(file, start, end);
if (rc)
return rc;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 78b41e1d5c67..60726ae7cf26 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,16 +619,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root)
goto out_no_root;
- /* logical blocks are represented by 40 bits in pxd_t, etc. */
- sb->s_maxbytes = ((u64) sb->s_blocksize) << 40;
-#if BITS_PER_LONG == 32
- /*
- * Page cache is indexed by long.
- * I would use MAX_LFS_FILESIZE, but it's only half as big
+ /* logical blocks are represented by 40 bits in pxd_t, etc.
+ * and page cache is indexed by long
*/
- sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1,
- (u64)sb->s_maxbytes);
-#endif
+ sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE);
sb->s_time_gran = 1;
return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 744192539010..9698e51656b1 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -997,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (key) {
- lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+ lockdep_init_map(&kn->dep_map, "kn->count", key, 0);
kn->flags |= KERNFS_LOCKDEP;
}
#endif
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 7c452f4d83e9..95a7c88baed9 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -99,8 +99,8 @@ static struct inode *kernfs_fh_get_inode(struct super_block *sb,
return ERR_PTR(-ESTALE);
inode = kernfs_get_inode(sb, kn);
kernfs_put(kn);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
if (generation && inode->i_generation != generation) {
/* we didn't find the right inode.. */
diff --git a/fs/locks.c b/fs/locks.c
index afefeb4ad6de..1bd71c4d663a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,6 +137,7 @@
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
+#define IS_REMOTELCK(fl) (fl->fl_pid <= 0)
static inline bool is_remote_lock(struct file *filp)
{
@@ -270,6 +271,22 @@ locks_check_ctx_lists(struct inode *inode)
}
}
+static void
+locks_check_ctx_file_list(struct file *filp, struct list_head *list,
+ char *list_type)
+{
+ struct file_lock *fl;
+ struct inode *inode = locks_inode(filp);
+
+ list_for_each_entry(fl, list, fl_list)
+ if (fl->fl_file == filp)
+ pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
+ " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
+ list_type, MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev), inode->i_ino,
+ fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+}
+
void
locks_free_lock_context(struct inode *inode)
{
@@ -733,7 +750,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
static void
locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
{
- fl->fl_nspid = get_pid(task_tgid(current));
list_add_tail(&fl->fl_list, before);
locks_insert_global_locks(fl);
}
@@ -743,10 +759,6 @@ locks_unlink_lock_ctx(struct file_lock *fl)
{
locks_delete_global_locks(fl);
list_del_init(&fl->fl_list);
- if (fl->fl_nspid) {
- put_pid(fl->fl_nspid);
- fl->fl_nspid = NULL;
- }
locks_wake_up_blocks(fl);
}
@@ -823,8 +835,6 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
if (posix_locks_conflict(fl, cfl)) {
locks_copy_conflock(fl, cfl);
- if (cfl->fl_nspid)
- fl->fl_pid = pid_vnr(cfl->fl_nspid);
goto out;
}
}
@@ -2048,9 +2058,33 @@ int vfs_test_lock(struct file *filp, struct file_lock *fl)
}
EXPORT_SYMBOL_GPL(vfs_test_lock);
+/**
+ * locks_translate_pid - translate a file_lock's fl_pid number into a namespace
+ * @fl: The file_lock who's fl_pid should be translated
+ * @ns: The namespace into which the pid should be translated
+ *
+ * Used to tranlate a fl_pid into a namespace virtual pid number
+ */
+static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
+{
+ pid_t vnr;
+ struct pid *pid;
+
+ if (IS_OFDLCK(fl))
+ return -1;
+ if (IS_REMOTELCK(fl))
+ return fl->fl_pid;
+
+ rcu_read_lock();
+ pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
+ vnr = pid_nr_ns(pid, ns);
+ rcu_read_unlock();
+ return vnr;
+}
+
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
- flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
+ flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
@@ -2072,7 +2106,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
- flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
+ flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
@@ -2086,14 +2120,17 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
*/
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
- struct file_lock file_lock;
+ struct file_lock *fl;
int error;
+ fl = locks_alloc_lock();
+ if (fl == NULL)
+ return -ENOMEM;
error = -EINVAL;
if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
- error = flock_to_posix_lock(filp, &file_lock, flock);
+ error = flock_to_posix_lock(filp, fl, flock);
if (error)
goto out;
@@ -2103,23 +2140,22 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
goto out;
cmd = F_GETLK;
- file_lock.fl_flags |= FL_OFDLCK;
- file_lock.fl_owner = filp;
+ fl->fl_flags |= FL_OFDLCK;
+ fl->fl_owner = filp;
}
- error = vfs_test_lock(filp, &file_lock);
+ error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = file_lock.fl_type;
- if (file_lock.fl_type != F_UNLCK) {
- error = posix_lock_to_flock(flock, &file_lock);
+ flock->l_type = fl->fl_type;
+ if (fl->fl_type != F_UNLCK) {
+ error = posix_lock_to_flock(flock, fl);
if (error)
- goto rel_priv;
+ goto out;
}
-rel_priv:
- locks_release_private(&file_lock);
out:
+ locks_free_lock(fl);
return error;
}
@@ -2298,14 +2334,18 @@ out:
*/
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
- struct file_lock file_lock;
+ struct file_lock *fl;
int error;
+ fl = locks_alloc_lock();
+ if (fl == NULL)
+ return -ENOMEM;
+
error = -EINVAL;
if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
goto out;
- error = flock64_to_posix_lock(filp, &file_lock, flock);
+ error = flock64_to_posix_lock(filp, fl, flock);
if (error)
goto out;
@@ -2315,20 +2355,20 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
goto out;
cmd = F_GETLK64;
- file_lock.fl_flags |= FL_OFDLCK;
- file_lock.fl_owner = filp;
+ fl->fl_flags |= FL_OFDLCK;
+ fl->fl_owner = filp;
}
- error = vfs_test_lock(filp, &file_lock);
+ error = vfs_test_lock(filp, fl);
if (error)
goto out;
- flock->l_type = file_lock.fl_type;
- if (file_lock.fl_type != F_UNLCK)
- posix_lock_to_flock64(flock, &file_lock);
+ flock->l_type = fl->fl_type;
+ if (fl->fl_type != F_UNLCK)
+ posix_lock_to_flock64(flock, fl);
- locks_release_private(&file_lock);
out:
+ locks_free_lock(fl);
return error;
}
@@ -2525,6 +2565,12 @@ void locks_remove_file(struct file *filp)
/* remove any leases */
locks_remove_lease(filp, ctx);
+
+ spin_lock(&ctx->flc_lock);
+ locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX");
+ locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK");
+ locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE");
+ spin_unlock(&ctx->flc_lock);
}
/**
@@ -2578,22 +2624,16 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
{
struct inode *inode = NULL;
unsigned int fl_pid;
+ struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
- if (fl->fl_nspid) {
- struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
-
- /* Don't let fl_pid change based on who is reading the file */
- fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns);
-
- /*
- * If there isn't a fl_pid don't display who is waiting on
- * the lock if we are called from locks_show, or if we are
- * called from __show_fd_info - skip lock entirely
- */
- if (fl_pid == 0)
- return;
- } else
- fl_pid = fl->fl_pid;
+ fl_pid = locks_translate_pid(fl, proc_pidns);
+ /*
+ * If there isn't a fl_pid don't display who is waiting on
+ * the lock if we are called from locks_show, or if we are
+ * called from __show_fd_info - skip lock entirely
+ */
+ if (fl_pid == 0)
+ return;
if (fl->fl_file != NULL)
inode = locks_inode(fl->fl_file);
@@ -2668,7 +2708,7 @@ static int locks_show(struct seq_file *f, void *v)
fl = hlist_entry(v, struct file_lock, fl_link);
- if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns))
+ if (locks_translate_pid(fl, proc_pidns) == 0)
return 0;
lock_get_status(f, fl, iter->li_pos, "");
diff --git a/fs/namei.c b/fs/namei.c
index ddb6a7c2b3d4..1180f9c58093 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd,
* of the daemon to instantiate them before they can be used.
*/
if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
- path->dentry->d_inode)
- return -EISDIR;
+ LOOKUP_OPEN | LOOKUP_CREATE |
+ LOOKUP_AUTOMOUNT))) {
+ /* Positive dentry that isn't meant to trigger an
+ * automount, EISDIR will allow it to be used,
+ * otherwise there's no mount here "now" so return
+ * ENOENT.
+ */
+ if (path->dentry->d_inode)
+ return -EISDIR;
+ else
+ return -ENOENT;
+ }
if (path->dentry->d_sb->s_user_ns != &init_user_ns)
return -EACCES;
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 76965e772264..a06c07619ee6 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -23,7 +23,7 @@
static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- return filemap_write_and_wait_range(file->f_mapping, start, end);
+ return file_write_and_wait_range(file, start, end);
}
/*
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 777b055063f6..3025fe8584a0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
}
/*
- * Indication from FS-Cache that the cookie is no longer cached
- * - This function is called when the backing store currently caching a cookie
- * is removed
- * - The netfs should use this to clean up any markers indicating cached pages
- * - This is mandatory for any object that may have data
- */
-static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct nfs_inode *nfsi = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
-
- for (;;) {
- /* grab a bunch of pages to unmark */
- nr_pages = pagevec_lookup(&pvec,
- nfsi->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
-/*
* Get an extra reference on a read context.
* - This function can be absent if the completion function doesn't require a
* context.
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
.get_attr = nfs_fscache_inode_get_attr,
.get_aux = nfs_fscache_inode_get_aux,
.check_aux = nfs_fscache_inode_check_aux,
- .now_uncached = nfs_fscache_inode_now_uncached,
.get_context = nfs_fh_get_context,
.put_context = nfs_fh_put_context,
};
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 38d0383dc7f9..bc69d40c4e8b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int use_wgather;
loff_t pos = offset;
unsigned int pflags = current->flags;
- int flags = 0;
+ rwf_t flags = 0;
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..8616c46d33da 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
pagevec_init(&pvec, 0);
repeat:
- n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+ n = pagevec_lookup(&pvec, smap, &index);
if (!n)
return;
- index = pvec.pages[n - 1]->index + 1;
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i], *dpage;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2430a0415995..cba328315929 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -133,7 +133,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
kmem_cache_free(dnotify_mark_cache, dn_mark);
}
-static struct fsnotify_ops dnotify_fsnotify_ops = {
+static const struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_event = dnotify_handle_event,
.free_mark = dnotify_free_mark,
};
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 0ee19ecc982d..1a24be9e8405 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1506,7 +1506,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+ err = file_write_and_wait_range(filp, start, end);
if (err)
return err;
inode_lock(vi);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c4f68c338735..331910fa8442 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1989,7 +1989,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
- err = filemap_write_and_wait_range(vi->i_mapping, start, end);
+ err = file_write_and_wait_range(filp, start, end);
if (err)
return err;
inode_lock(vi);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e50a387959bf..40b5cc97f7b0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -221,7 +221,7 @@ out:
/*
* Set the access or default ACL of an inode.
*/
-int ocfs2_set_acl(handle_t *handle,
+static int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 2783a75b3999..7be0bb756286 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-int ocfs2_set_acl(handle_t *handle,
- struct inode *inode,
- struct buffer_head *di_bh,
- int type,
- struct posix_acl *acl,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fb15a96df0b6..a177eae3aa1a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
/*
* How many free extents have we got before we need more meta data?
*/
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et)
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
{
int retval;
struct ocfs2_extent_list *el = NULL;
@@ -1933,14 +1932,12 @@ out:
* the new changes.
*
* left_rec: the record on the left.
- * left_child_el: is the child list pointed to by left_rec
* right_rec: the record to the right of left_rec
* right_child_el: is the child list pointed to by right_rec
*
* By definition, this only works on interior nodes.
*/
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
- struct ocfs2_extent_list *left_child_el,
struct ocfs2_extent_rec *right_rec,
struct ocfs2_extent_list *right_child_el)
{
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
*/
BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
- ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+ ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
&root_el->l_recs[i + 1], right_el);
}
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
el = right_path->p_node[i].el;
right_rec = &el->l_recs[0];
- ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
- right_el);
+ ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2509,7 +2505,7 @@ out_ret_path:
static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_tree *et,
- int subtree_index, struct ocfs2_path *path)
+ struct ocfs2_path *path)
{
int i, idx, ret;
struct ocfs2_extent_rec *rec;
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
if (del_right_subtree) {
ocfs2_unlink_subtree(handle, et, left_path, right_path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
ocfs2_unlink_subtree(handle, et, left_path, path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
- free_extents = ocfs2_num_free_extents(osb, et);
+ free_extents = ocfs2_num_free_extents(et);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
*ac = NULL;
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..27b75cf32cfa 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt *dealloc,
u64 refcount_loc, bool refcount_tree_locked);
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et);
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
/*
* how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6aea15746a56..d0206042d068 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
}
}
-static void o2hb_wait_on_io(struct o2hb_region *reg,
- struct o2hb_bio_wait_ctxt *wc)
+static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
{
o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
status = 0;
bail_and_wait:
- o2hb_wait_on_io(reg, &wc);
+ o2hb_wait_on_io(&wc);
if (wc.wc_error && !status)
status = wc.wc_error;
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* before we can go to steady state. This ensures that
* people we find in our steady state have seen us.
*/
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
o2hb_prepare_block(reg, 0);
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret == 0)
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
else
mlog_errno(ret);
}
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
}
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
-int o2hb_check_node_heartbeating(u8 node_num)
-{
- unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-
- o2hb_fill_node_map(testing_map, sizeof(testing_map));
- if (!test_bit(node_num, testing_map)) {
- mlog(ML_HEARTBEAT,
- "node (%u) does not have heartbeating enabled.\n",
- node_num);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
-
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
-/* Makes sure our local node is configured with a node number, and is
- * heartbeating. */
-int o2hb_check_local_node_heartbeating(void)
-{
- u8 node_num;
-
- /* if this node was set then we have networking */
- node_num = o2nm_this_node();
- if (node_num == O2NM_MAX_NODES) {
- mlog(ML_HEARTBEAT, "this node has not been configured.\n");
- return 0;
- }
-
- return o2hb_check_node_heartbeating(node_num);
-}
-EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
-
/*
* this is just a hack until we get the plumbing which flips file systems
* read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3ecb9f337b7d..febe6312ceff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(dir)->ip_lock);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
parent_fe_bh);
- num_free_extents = ocfs2_num_free_extents(osb, &et);
+ num_free_extents = ocfs2_num_free_extents(&et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..6e41fc8fabbe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -196,7 +196,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(file, start, end);
if (err)
return err;
@@ -713,13 +713,6 @@ leave:
return status;
}
-int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
- u32 clusters_to_add, int mark_unwritten)
-{
- return __ocfs2_extend_allocation(inode, logical_start,
- clusters_to_add, mark_unwritten);
-}
-
/*
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d5e5fa7f0743..36304434eacf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
ocfs2_schedule_truncate_log_flush(osb, 0);
osb->local_alloc_copy = NULL;
- osb->dirty = 0;
/* queue to recover orphan slots for all offline slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e52a2852d50d..7eb3b0a6347e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0c39d71c67a1..9a50f222ac97 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -320,7 +320,6 @@ struct ocfs2_super
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
- u8 *uuid;
char *uuid_str;
u32 uuid_hash;
u8 *vol_label;
@@ -388,9 +387,8 @@ struct ocfs2_super
unsigned int osb_resv_level;
unsigned int osb_dir_resv_level;
- /* Next three fields are for local node slot recovery during
+ /* Next two fields are for local node slot recovery during
* mount. */
- int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_quota_recovery *quota_rec;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index cec495a921e3..c94b6baaa551 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -33,7 +33,7 @@
* Locking of quotas with OCFS2 is rather complex. Here are rules that
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
- * by dqio_mutex or dquot->dq_lock.
+ * by dqio_sem or dquot->dq_lock.
* - any modification of global quota file holds inode cluster lock, i_mutex,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
@@ -42,9 +42,9 @@
*
* A rough sketch of locking dependencies (lf = local file, gf = global file):
* Normal filesystem operation:
- * start_trans -> dqio_mutex -> write to lf
+ * start_trans -> dqio_sem -> write to lf
* Syncing of local and global file:
- * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock ->
* write to gf
* -> write to lf
* Acquire dquot for the first time:
@@ -60,7 +60,7 @@
* Recovery:
* inode cluster lock of recovered lf
* -> read bitmaps -> ip_alloc_sem of lf
- * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
+ * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock ->
* write to gf
*/
@@ -443,13 +443,17 @@ static int __ocfs2_global_write_info(struct super_block *sb, int type)
int ocfs2_global_write_info(struct super_block *sb, int type)
{
int err;
- struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv;
+ down_write(&dqopt->dqio_sem);
err = ocfs2_qinfo_lock(info, 1);
if (err < 0)
- return err;
+ goto out_sem;
err = __ocfs2_global_write_info(sb, type);
ocfs2_qinfo_unlock(info, 1);
+out_sem:
+ up_write(&dqopt->dqio_sem);
return err;
}
@@ -500,7 +504,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
/* Update space and inode usage. Get also other information from
* global quota file so that we don't overwrite any changes there.
* We are */
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
spacechange = dquot->dq_dqb.dqb_curspace -
OCFS2_DQUOT(dquot)->dq_origspace;
inodechange = dquot->dq_dqb.dqb_curinodes -
@@ -556,7 +560,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
err = ocfs2_qinfo_lock(info, freeing);
if (err < 0) {
mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
@@ -611,7 +615,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
mlog_errno(status);
goto out_ilock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ down_write(&sb_dqopt(sb)->dqio_sem);
status = ocfs2_sync_dquot(dquot);
if (status < 0)
mlog_errno(status);
@@ -619,7 +623,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
status = ocfs2_local_write_dquot(dquot);
if (status < 0)
mlog_errno(status);
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
@@ -666,9 +670,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
mlog_errno(status);
goto out;
}
- mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
status = ocfs2_local_write_dquot(dquot);
- mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
+ up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out:
return status;
@@ -920,10 +924,10 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
/* In case user set some limits, sync dquot immediately to global
* quota file so that information propagates quicker */
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (dquot->dq_flags & mask)
sync = 1;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
/* This is a slight hack but we can't afford getting global quota
* lock if we already have a transaction started. */
if (!sync || journal_current_handle()) {
@@ -939,7 +943,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
mlog_errno(status);
goto out_ilock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ down_write(&sb_dqopt(sb)->dqio_sem);
status = ocfs2_sync_dquot(dquot);
if (status < 0) {
mlog_errno(status);
@@ -948,7 +952,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
/* Now write updated local dquot structure */
status = ocfs2_local_write_dquot(dquot);
out_dlock:
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(osb, handle);
out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 32c5a40c1257..aa700fd10610 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -520,8 +520,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
mlog_errno(status);
goto out_drop_lock;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
- spin_lock(&dq_data_lock);
+ down_write(&sb_dqopt(sb)->dqio_sem);
+ spin_lock(&dquot->dq_dqb_lock);
/* Add usage from quota entry into quota changes
* of our node. Auxiliary variables are important
* due to signedness */
@@ -529,7 +529,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
inodechange = le64_to_cpu(dqblk->dqb_inodemod);
dquot->dq_dqb.dqb_curspace += spacechange;
dquot->dq_dqb.dqb_curinodes += inodechange;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
/* We want to drop reference held by the crashed
* node. Since we have our own reference we know
* global structure actually won't be freed. */
@@ -553,7 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
unlock_buffer(qbh);
ocfs2_journal_dirty(handle, qbh);
out_commit:
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ up_write(&sb_dqopt(sb)->dqio_sem);
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out_drop_lock:
ocfs2_unlock_global_qf(oinfo, 1);
@@ -691,9 +691,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
struct ocfs2_quota_recovery *rec;
int locked = 0;
- /* We don't need the lock and we have to acquire quota file locks
- * which will later depend on this lock */
- mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -772,7 +769,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
goto out_err;
}
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
return 0;
out_err:
if (oinfo) {
@@ -786,7 +782,6 @@ out_err:
kfree(oinfo);
}
brelse(bh);
- mutex_lock(&sb_dqopt(sb)->dqio_mutex);
return -1;
}
@@ -882,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns,
od->dq_dquot.dq_id));
- spin_lock(&dq_data_lock);
+ spin_lock(&od->dq_dquot.dq_dqb_lock);
dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
od->dq_origspace);
dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
od->dq_originodes);
- spin_unlock(&dq_data_lock);
+ spin_unlock(&od->dq_dquot.dq_dqb_lock);
trace_olq_set_dquot(
(unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
(unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
int *credits)
{
int ret = 0, meta_add = 0;
- int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+ int num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
if (dirty) {
/* Recovery will be completed after we've mounted the
* rest of the volume. */
- osb->dirty = 1;
osb->local_alloc_copy = local_alloc;
local_alloc = NULL;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
*credits += 1;
/* count in the xattr tree change. */
- num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ num_free_extents = ocfs2_num_free_extents(xt_et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 3d424a51cabb..f0fd3adb1693 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
- smp_mb__before_spinlock();
+
inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
return PTR_ERR(realfile);
}
- od->upperfile = realfile;
+ smp_store_release(&od->upperfile, realfile);
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea1..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = {
static int sched_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
struct task_struct *p;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
- proc_sched_show_task(p, m);
+ proc_sched_show_task(p, ns, m);
put_task_struct(p);
@@ -2930,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3323,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 50493edc30e5..e5709343feb7 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -7,14 +7,14 @@ static int devinfo_show(struct seq_file *f, void *v)
{
int i = *(loff_t *) v;
- if (i < CHRDEV_MAJOR_HASH_SIZE) {
+ if (i < CHRDEV_MAJOR_MAX) {
if (i == 0)
seq_puts(f, "Character devices:\n");
chrdev_show(f, i);
}
#ifdef CONFIG_BLOCK
else {
- i -= CHRDEV_MAJOR_HASH_SIZE;
+ i -= CHRDEV_MAJOR_MAX;
if (i == 0)
seq_puts(f, "\nBlock devices:\n");
blkdev_show(f, i);
@@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v)
static void *devinfo_start(struct seq_file *f, loff_t *pos)
{
- if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+ if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return pos;
return NULL;
}
@@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos)
static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+ if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return NULL;
return pos;
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e3cda0b5968f..793a67574668 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -40,8 +40,8 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry
static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir)
{
- return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry,
- subdir_node);
+ return rb_entry_safe(rb_first_cached(&dir->subdir),
+ struct proc_dir_entry, subdir_node);
}
static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
@@ -54,7 +54,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
const char *name,
unsigned int len)
{
- struct rb_node *node = dir->subdir.rb_node;
+ struct rb_node *node = dir->subdir.rb_root.rb_node;
while (node) {
struct proc_dir_entry *de = rb_entry(node,
@@ -75,8 +75,9 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
static bool pde_subdir_insert(struct proc_dir_entry *dir,
struct proc_dir_entry *de)
{
- struct rb_root *root = &dir->subdir;
- struct rb_node **new = &root->rb_node, *parent = NULL;
+ struct rb_root_cached *root = &dir->subdir;
+ struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
+ bool leftmost = true;
/* Figure out where to put new node */
while (*new) {
@@ -88,15 +89,16 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
parent = *new;
if (result < 0)
new = &(*new)->rb_left;
- else if (result > 0)
+ else if (result > 0) {
new = &(*new)->rb_right;
- else
+ leftmost = false;
+ } else
return false;
}
/* Add new node and rebalance tree. */
rb_link_node(&de->subdir_node, parent, new);
- rb_insert_color(&de->subdir_node, root);
+ rb_insert_color_cached(&de->subdir_node, root, leftmost);
return true;
}
@@ -369,7 +371,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->namelen = qstr.len;
ent->mode = mode;
ent->nlink = nlink;
- ent->subdir = RB_ROOT;
+ ent->subdir = RB_ROOT_CACHED;
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
@@ -499,6 +501,14 @@ out:
}
EXPORT_SYMBOL(proc_create_data);
+struct proc_dir_entry *proc_create(const char *name, umode_t mode,
+ struct proc_dir_entry *parent,
+ const struct file_operations *proc_fops)
+{
+ return proc_create_data(name, mode, parent, proc_fops, NULL);
+}
+EXPORT_SYMBOL(proc_create);
+
void proc_set_size(struct proc_dir_entry *de, loff_t size)
{
de->size = size;
@@ -545,7 +555,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
- rb_erase(&de->subdir_node, &parent->subdir);
+ rb_erase_cached(&de->subdir_node, &parent->subdir);
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -582,13 +592,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
write_unlock(&proc_subdir_lock);
return -ENOENT;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ rb_erase_cached(&root->subdir_node, &parent->subdir);
de = root;
while (1) {
next = pde_subdir_first(de);
if (next) {
- rb_erase(&next->subdir_node, &de->subdir);
+ rb_erase_cached(&next->subdir_node, &de->subdir);
de = next;
continue;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..a34195e92b20 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -40,7 +40,7 @@ struct proc_dir_entry {
const struct inode_operations *proc_iops;
const struct file_operations *proc_fops;
struct proc_dir_entry *parent;
- struct rb_root subdir;
+ struct rb_root_cached subdir;
struct rb_node subdir_node;
void *data;
atomic_t count; /* use count */
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
*/
+struct mem_size_stats;
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
+ struct mem_size_stats *rollup;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
- show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+ show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
#ifdef CONFIG_HIGHMEM
show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SUnreclaim: ",
global_node_page_state(NR_SLAB_UNRECLAIMABLE));
seq_printf(m, "KernelStack: %8lu kB\n",
- global_page_state(NR_KERNEL_STACK_KB));
+ global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
- global_page_state(NR_PAGETABLE));
+ global_zone_page_state(NR_PAGETABLE));
#ifdef CONFIG_QUICKLIST
show_val_kb(m, "Quicklists: ", quicklist_total_size());
#endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "NFS_Unstable: ",
global_node_page_state(NR_UNSTABLE_NFS));
show_val_kb(m, "Bounce: ",
- global_page_state(NR_BOUNCE));
+ global_zone_page_state(NR_BOUNCE));
show_val_kb(m, "WritebackTmp: ",
global_node_page_state(NR_WRITEBACK_TEMP));
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CMA
show_val_kb(m, "CmaTotal: ", totalcma_pages);
show_val_kb(m, "CmaFree: ",
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
#endif
hugetlb_report_meminfo(m);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d72fc40241d9..a2bf369c923d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net)
if (!netd)
goto out;
- netd->subdir = RB_ROOT;
+ netd->subdir = RB_ROOT_CACHED;
netd->data = net;
netd->nlink = 2;
netd->namelen = 3;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index deecb397daa3..926fb27f4ca2 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -210,7 +210,7 @@ struct proc_dir_entry proc_root = {
.proc_iops = &proc_root_inode_operations,
.proc_fops = &proc_root_operations,
.parent = &proc_root,
- .subdir = RB_ROOT,
+ .subdir = RB_ROOT_CACHED,
.name = "/proc",
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..7b40e11ede9b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
if (priv->mm)
mmdrop(priv->mm);
+ kfree(priv->rollup);
return seq_release_private(inode, file);
}
@@ -267,8 +268,7 @@ static int do_maps_open(struct inode *inode, struct file *file,
* Indicate if the VMA is a stack for the given task; for
* /proc/PID/maps that is the stack of the main task.
*/
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
+static int is_stack(struct vm_area_struct *vma)
{
/*
* We make no effort to guess what a given thread considers to be
@@ -279,12 +279,28 @@ static int is_stack(struct proc_maps_private *priv,
vma->vm_end >= vma->vm_mm->start_stack;
}
+static void show_vma_header_prefix(struct seq_file *m,
+ unsigned long start, unsigned long end,
+ vm_flags_t flags, unsigned long long pgoff,
+ dev_t dev, unsigned long ino)
+{
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ start,
+ end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ pgoff,
+ MAJOR(dev), MINOR(dev), ino);
+}
+
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
- struct proc_maps_private *priv = m->private;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
@@ -301,17 +317,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
start = vma->vm_start;
end = vma->vm_end;
-
- seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
- start,
- end,
- flags & VM_READ ? 'r' : '-',
- flags & VM_WRITE ? 'w' : '-',
- flags & VM_EXEC ? 'x' : '-',
- flags & VM_MAYSHARE ? 's' : 'p',
- pgoff,
- MAJOR(dev), MINOR(dev), ino);
+ show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
/*
* Print the dentry name for named mappings, and a
@@ -342,7 +348,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- if (is_stack(priv, vma))
+ if (is_stack(vma))
name = "[stack]";
}
@@ -430,6 +436,7 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
+ bool first;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -443,7 +450,9 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long first_vma_start;
u64 pss;
+ u64 pss_locked;
u64 swap_pss;
bool check_shmem_swap;
};
@@ -538,6 +547,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = find_get_entry(vma->vm_file->f_mapping,
@@ -597,13 +608,14 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- smaps_pmd_entry(pmd, addr, walk);
+ if (pmd_present(*pmd))
+ smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
- return 0;
+ goto out;
}
if (pmd_trans_unstable(pmd))
- return 0;
+ goto out;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
@@ -613,6 +625,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (; addr != end; pte++, addr += PAGE_SIZE)
smaps_pte_entry(pte, addr, walk);
pte_unmap_unlock(pte - 1, ptl);
+out:
cond_resched();
return 0;
}
@@ -652,6 +665,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
[ilog2(VM_ARCH_1)] = "ar",
+ [ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
[ilog2(VM_SOFTDIRTY)] = "sd",
@@ -700,6 +714,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -719,18 +735,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
+ struct mem_size_stats mss_stack;
+ struct mem_size_stats *mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
.hugetlb_entry = smaps_hugetlb_range,
#endif
.mm = vma->vm_mm,
- .private = &mss,
};
+ int ret = 0;
+ bool rollup_mode;
+ bool last_vma;
- memset(&mss, 0, sizeof mss);
+ if (priv->rollup) {
+ rollup_mode = true;
+ mss = priv->rollup;
+ if (mss->first) {
+ mss->first_vma_start = vma->vm_start;
+ mss->first = false;
+ }
+ last_vma = !m_next_vma(priv, vma);
+ } else {
+ rollup_mode = false;
+ memset(&mss_stack, 0, sizeof(mss_stack));
+ mss = &mss_stack;
+ }
+
+ smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +782,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss.swap = shmem_swapped;
+ mss->swap = shmem_swapped;
} else {
- mss.check_shmem_swap = true;
+ mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
}
}
@@ -758,54 +792,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
+ if (vma->vm_flags & VM_LOCKED)
+ mss->pss_locked += mss->pss;
- show_map_vma(m, vma, is_pid);
+ if (!rollup_mode) {
+ show_map_vma(m, vma, is_pid);
+ } else if (last_vma) {
+ show_vma_header_prefix(
+ m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+ } else {
+ ret = SEQ_SKIP;
+ }
+
+ if (!rollup_mode)
+ seq_printf(m,
+ "Size: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n",
+ (vma->vm_end - vma->vm_start) >> 10,
+ vma_kernel_pagesize(vma) >> 10,
+ vma_mmu_pagesize(vma) >> 10);
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "LazyFree: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
- "Shared_Hugetlb: %8lu kB\n"
- "Private_Hugetlb: %7lu kB\n"
- "Swap: %8lu kB\n"
- "SwapPss: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss.resident >> 10,
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
- mss.shared_clean >> 10,
- mss.shared_dirty >> 10,
- mss.private_clean >> 10,
- mss.private_dirty >> 10,
- mss.referenced >> 10,
- mss.anonymous >> 10,
- mss.lazyfree >> 10,
- mss.anonymous_thp >> 10,
- mss.shmem_thp >> 10,
- mss.shared_hugetlb >> 10,
- mss.private_hugetlb >> 10,
- mss.swap >> 10,
- (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
- vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10,
- (vma->vm_flags & VM_LOCKED) ?
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
- arch_show_smap(m, vma);
- show_smap_vma_flags(m, vma);
+ if (!rollup_mode || last_vma)
+ seq_printf(m,
+ "Rss: %8lu kB\n"
+ "Pss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
+ "ShmemPmdMapped: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
+ "Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
+ "Locked: %8lu kB\n",
+ mss->resident >> 10,
+ (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
+ mss->shared_clean >> 10,
+ mss->shared_dirty >> 10,
+ mss->private_clean >> 10,
+ mss->private_dirty >> 10,
+ mss->referenced >> 10,
+ mss->anonymous >> 10,
+ mss->lazyfree >> 10,
+ mss->anonymous_thp >> 10,
+ mss->shmem_thp >> 10,
+ mss->shared_hugetlb >> 10,
+ mss->private_hugetlb >> 10,
+ mss->swap >> 10,
+ (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
+ (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
+
+ if (!rollup_mode) {
+ arch_show_smap(m, vma);
+ show_smap_vma_flags(m, vma);
+ }
m_cache_vma(m, vma);
- return 0;
+ return ret;
}
static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +888,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
+static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct proc_maps_private *priv;
+ int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
+
+ if (ret < 0)
+ return ret;
+ seq = file->private_data;
+ priv = seq->private;
+ priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
+ if (!priv->rollup) {
+ proc_map_release(inode, file);
+ return -ENOMEM;
+ }
+ priv->rollup->first = true;
+ return 0;
+}
+
static int tid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +919,13 @@ const struct file_operations proc_pid_smaps_operations = {
.release = proc_map_release,
};
+const struct file_operations proc_pid_smaps_rollup_operations = {
+ .open = pid_smaps_rollup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
@@ -904,17 +981,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
{
pmd_t pmd = *pmdp;
- /* See comment in change_huge_pmd() */
- pmdp_invalidate(vma, addr, pmdp);
- if (pmd_dirty(*pmdp))
- pmd = pmd_mkdirty(pmd);
- if (pmd_young(*pmdp))
- pmd = pmd_mkyoung(pmd);
+ if (pmd_present(pmd)) {
+ /* See comment in change_huge_pmd() */
+ pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(*pmdp))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(*pmdp))
+ pmd = pmd_mkyoung(pmd);
- pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_soft_dirty(pmd);
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_soft_dirty(pmd);
- set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ pmd = pmd_swp_clear_soft_dirty(pmd);
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ }
}
#else
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
@@ -939,6 +1021,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
goto out;
}
+ if (!pmd_present(*pmd))
+ goto out;
+
page = pmd_page(*pmd);
/* Clear accessed and referenced bits. */
@@ -1181,7 +1266,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
- page = vm_normal_page(vma, addr, pte);
+ page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
@@ -1194,6 +1279,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
+
+ if (is_device_private_entry(entry))
+ page = device_private_entry_to_page(entry);
}
if (page && !PageAnon(page))
@@ -1220,27 +1308,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (ptl) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
+ struct page *page = NULL;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
- /*
- * Currently pmd for thp is always present because thp
- * can not be swapped-out, migrated, or HWPOISONed
- * (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
if (pmd_present(pmd)) {
- struct page *page = pmd_page(pmd);
-
- if (page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ page = pmd_page(pmd);
flags |= PM_PRESENT;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ else if (is_swap_pmd(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ flags |= PM_SWAP;
+ VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ page = migration_entry_to_page(entry);
+ }
+#endif
+
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
pagemap_entry_t pme = make_pme(frame, flags);
@@ -1673,7 +1767,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else if (is_stack(proc_priv, vma)) {
+ } else if (is_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 23266694db11..dea90b566a6e 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -125,8 +125,7 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma)
+static int is_stack(struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
@@ -178,7 +177,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(priv, vma)) {
+ } else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
seq_printf(m, "[stack]");
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index fefd22611cf6..d814723fb27d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -36,7 +36,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
-#include <linux/syslog.h>
#include "internal.h"
@@ -132,18 +131,6 @@ static const struct seq_operations pstore_ftrace_seq_ops = {
.show = pstore_ftrace_seq_show,
};
-static int pstore_check_syslog_permissions(struct pstore_private *ps)
-{
- switch (ps->record->type) {
- case PSTORE_TYPE_DMESG:
- case PSTORE_TYPE_CONSOLE:
- return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
- SYSLOG_FROM_READER);
- default:
- return 0;
- }
-}
-
static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
size_t count, loff_t *ppos)
{
@@ -163,10 +150,6 @@ static int pstore_file_open(struct inode *inode, struct file *file)
int err;
const struct seq_operations *sops = NULL;
- err = pstore_check_syslog_permissions(ps);
- if (err)
- return err;
-
if (ps->record->type == PSTORE_TYPE_FTRACE)
sops = &pstore_ftrace_seq_ops;
@@ -204,11 +187,6 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = d_inode(dentry)->i_private;
struct pstore_record *record = p->record;
- int err;
-
- err = pstore_check_syslog_permissions(p);
- if (err)
- return err;
if (!record->psi->erase)
return -EPERM;
@@ -471,7 +449,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
inode = pstore_get_inode(sb);
if (inode) {
- inode->i_mode = S_IFDIR | 0755;
+ inode->i_mode = S_IFDIR | 0750;
inode->i_op = &pstore_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
inc_nlink(inode);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 566e6ef99f07..8381db9db6d9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,16 +82,19 @@
#include <linux/uaccess.h>
/*
- * There are three quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats.
- * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
- * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
- * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
- * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
- * modifications of quota state (on quotaon and quotaoff) and readers who care
- * about latest values take it as well.
+ * There are five quota SMP locks:
+ * * dq_list_lock protects all lists with quotas and quota formats.
+ * * dquot->dq_dqb_lock protects data from dq_dqb
+ * * inode->i_lock protects inode->i_blocks, i_bytes and also guards
+ * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that
+ * dquot_transfer() can stabilize amount it transfers
+ * * dq_data_lock protects mem_dqinfo structures and modifications of dquot
+ * pointers in the inode
+ * * dq_state_lock protects modifications of quota state (on quotaon and
+ * quotaoff) and readers who care about latest values take it as well.
*
- * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
+ * The spinlock ordering is hence:
+ * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
* dq_list_lock > dq_state_lock
*
* Note that some things (eg. sb pointer, type, id) doesn't change during
@@ -110,17 +113,14 @@
* sure they cannot race with quotaon which first sets S_NOQUOTA flag and
* then drops all pointers to dquots from an inode.
*
- * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
- * from inodes (dquot_alloc_space() and such don't check the dq_lock).
- * Currently dquot is locked only when it is being read to memory (or space for
- * it is being allocated) on the first dqget() and when it is being released on
- * the last dqput(). The allocation and release oparations are serialized by
- * the dq_lock and by checking the use count in dquot_release(). Write
- * operations on dquots don't hold dq_lock as they copy data under dq_data_lock
- * spinlock to internal buffers before writing.
+ * Each dquot has its dq_lock mutex. Dquot is locked when it is being read to
+ * memory (or space for it is being allocated) on the first dqget(), when it is
+ * being written out, and when it is being released on the last dqput(). The
+ * allocation and release operations are serialized by the dq_lock and by
+ * checking the use count in dquot_release().
*
* Lock ordering (including related VFS locks) is the following:
- * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
+ * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
@@ -129,6 +129,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
DEFINE_STATIC_SRCU(dquot_srcu);
+static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);
+
void __quota_error(struct super_block *sb, const char *func,
const char *fmt, ...)
{
@@ -247,6 +249,7 @@ struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);
static qsize_t inode_get_rsv_space(struct inode *inode);
+static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);
static inline unsigned int
@@ -342,6 +345,12 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
{
int ret = 1;
+ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ return 0;
+
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
+ return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);
+
/* If quota is dirty already, we don't have to acquire dq_list_lock */
if (test_bit(DQ_MOD_B, &dquot->dq_flags))
return 1;
@@ -381,18 +390,26 @@ static inline void dqput_all(struct dquot **dquot)
dqput(dquot[cnt]);
}
-/* This function needs dq_list_lock */
static inline int clear_dquot_dirty(struct dquot *dquot)
{
- if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags))
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
+ return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags);
+
+ spin_lock(&dq_list_lock);
+ if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
return 0;
+ }
list_del_init(&dquot->dq_dirty);
+ spin_unlock(&dq_list_lock);
return 1;
}
void mark_info_dirty(struct super_block *sb, int type)
{
- set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags);
+ spin_lock(&dq_data_lock);
+ sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY;
+ spin_unlock(&dq_data_lock);
}
EXPORT_SYMBOL(mark_info_dirty);
@@ -406,7 +423,6 @@ int dquot_acquire(struct dquot *dquot)
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
- mutex_lock(&dqopt->dqio_mutex);
if (!test_bit(DQ_READ_B, &dquot->dq_flags))
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
@@ -436,7 +452,6 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
- mutex_unlock(&dqopt->dqio_mutex);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -450,21 +465,17 @@ int dquot_commit(struct dquot *dquot)
int ret = 0;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
- mutex_lock(&dqopt->dqio_mutex);
- spin_lock(&dq_list_lock);
- if (!clear_dquot_dirty(dquot)) {
- spin_unlock(&dq_list_lock);
- goto out_sem;
- }
- spin_unlock(&dq_list_lock);
+ mutex_lock(&dquot->dq_lock);
+ if (!clear_dquot_dirty(dquot))
+ goto out_lock;
/* Inactive dquot can be only if there was error during read/init
* => we have better not writing it */
if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
else
ret = -EIO;
-out_sem:
- mutex_unlock(&dqopt->dqio_mutex);
+out_lock:
+ mutex_unlock(&dquot->dq_lock);
return ret;
}
EXPORT_SYMBOL(dquot_commit);
@@ -481,7 +492,6 @@ int dquot_release(struct dquot *dquot)
/* Check whether we are not racing with some other dqget() */
if (atomic_read(&dquot->dq_count) > 1)
goto out_dqlock;
- mutex_lock(&dqopt->dqio_mutex);
if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
/* Write the info */
@@ -493,7 +503,6 @@ int dquot_release(struct dquot *dquot)
ret = ret2;
}
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
- mutex_unlock(&dqopt->dqio_mutex);
out_dqlock:
mutex_unlock(&dquot->dq_lock);
return ret;
@@ -530,22 +539,18 @@ restart:
continue;
/* Wait for dquot users */
if (atomic_read(&dquot->dq_count)) {
- DEFINE_WAIT(wait);
-
dqgrab(dquot);
- prepare_to_wait(&dquot->dq_wait_unused, &wait,
- TASK_UNINTERRUPTIBLE);
spin_unlock(&dq_list_lock);
- /* Once dqput() wakes us up, we know it's time to free
+ /*
+ * Once dqput() wakes us up, we know it's time to free
* the dquot.
* IMPORTANT: we rely on the fact that there is always
* at most one process waiting for dquot to free.
* Otherwise dq_count would be > 1 and we would never
* wake up.
*/
- if (atomic_read(&dquot->dq_count) > 1)
- schedule();
- finish_wait(&dquot->dq_wait_unused, &wait);
+ wait_event(dquot_ref_wq,
+ atomic_read(&dquot->dq_count) == 1);
dqput(dquot);
/* At this moment dquot() need not exist (it could be
* reclaimed by prune_dqcache(). Hence we must
@@ -629,11 +634,9 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
while (!list_empty(dirty)) {
dquot = list_first_entry(dirty, struct dquot,
dq_dirty);
- /* Dirty and inactive can be only bad dquot... */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
- clear_dquot_dirty(dquot);
- continue;
- }
+
+ WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
+
/* Now we have active dquot from which someone is
* holding reference so we can safely just increase
* use count */
@@ -759,12 +762,12 @@ we_slept:
/* Releasing dquot during quotaoff phase? */
if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
atomic_read(&dquot->dq_count) == 1)
- wake_up(&dquot->dq_wait_unused);
+ wake_up(&dquot_ref_wq);
spin_unlock(&dq_list_lock);
return;
}
/* Need to release dquot? */
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) {
+ if (dquot_dirty(dquot)) {
spin_unlock(&dq_list_lock);
/* Commit dquot before releasing */
ret = dquot->dq_sb->dq_op->write_dquot(dquot);
@@ -776,14 +779,10 @@ we_slept:
* We clear dirty bit anyway, so that we avoid
* infinite loop here
*/
- spin_lock(&dq_list_lock);
clear_dquot_dirty(dquot);
- spin_unlock(&dq_list_lock);
}
goto we_slept;
}
- /* Clear flag in case dquot was inactive (something bad happened) */
- clear_dquot_dirty(dquot);
if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
spin_unlock(&dq_list_lock);
dquot->dq_sb->dq_op->release_dquot(dquot);
@@ -818,10 +817,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
INIT_LIST_HEAD(&dquot->dq_inuse);
INIT_HLIST_NODE(&dquot->dq_hash);
INIT_LIST_HEAD(&dquot->dq_dirty);
- init_waitqueue_head(&dquot->dq_wait_unused);
dquot->dq_sb = sb;
dquot->dq_id = make_kqid_invalid(type);
atomic_set(&dquot->dq_count, 1);
+ spin_lock_init(&dquot->dq_dqb_lock);
return dquot;
}
@@ -1079,42 +1078,6 @@ static void drop_dquot_ref(struct super_block *sb, int type)
}
}
-static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
-{
- dquot->dq_dqb.dqb_curinodes += number;
-}
-
-static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
-{
- dquot->dq_dqb.dqb_curspace += number;
-}
-
-static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
-{
- dquot->dq_dqb.dqb_rsvspace += number;
-}
-
-/*
- * Claim reserved quota space
- */
-static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number)
-{
- if (dquot->dq_dqb.dqb_rsvspace < number) {
- WARN_ON_ONCE(1);
- number = dquot->dq_dqb.dqb_rsvspace;
- }
- dquot->dq_dqb.dqb_curspace += number;
- dquot->dq_dqb.dqb_rsvspace -= number;
-}
-
-static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number)
-{
- if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
- number = dquot->dq_dqb.dqb_curspace;
- dquot->dq_dqb.dqb_rsvspace += number;
- dquot->dq_dqb.dqb_curspace -= number;
-}
-
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
@@ -1278,21 +1241,24 @@ static int ignore_hardlimit(struct dquot *dquot)
!(info->dqi_flags & DQF_ROOT_SQUASH));
}
-/* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, qsize_t inodes,
- struct dquot_warn *warn)
+static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes,
+ struct dquot_warn *warn)
{
- qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
+ qsize_t newinodes;
+ int ret = 0;
+ spin_lock(&dquot->dq_dqb_lock);
+ newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return 0;
+ goto add;
if (dquot->dq_dqb.dqb_ihardlimit &&
newinodes > dquot->dq_dqb.dqb_ihardlimit &&
!ignore_hardlimit(dquot)) {
prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
- return -EDQUOT;
+ ret = -EDQUOT;
+ goto out;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1301,7 +1267,8 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
- return -EDQUOT;
+ ret = -EDQUOT;
+ goto out;
}
if (dquot->dq_dqb.dqb_isoftlimit &&
@@ -1311,30 +1278,40 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() +
sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
}
+add:
+ dquot->dq_dqb.dqb_curinodes = newinodes;
- return 0;
+out:
+ spin_unlock(&dquot->dq_dqb_lock);
+ return ret;
}
-/* needs dq_data_lock */
-static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
- struct dquot_warn *warn)
+static int dquot_add_space(struct dquot *dquot, qsize_t space,
+ qsize_t rsv_space, unsigned int flags,
+ struct dquot_warn *warn)
{
qsize_t tspace;
struct super_block *sb = dquot->dq_sb;
+ int ret = 0;
+ spin_lock(&dquot->dq_dqb_lock);
if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
test_bit(DQ_FAKE_B, &dquot->dq_flags))
- return 0;
+ goto add;
tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
- + space;
+ + space + rsv_space;
+
+ if (flags & DQUOT_SPACE_NOFAIL)
+ goto add;
if (dquot->dq_dqb.dqb_bhardlimit &&
tspace > dquot->dq_dqb.dqb_bhardlimit &&
!ignore_hardlimit(dquot)) {
- if (!prealloc)
+ if (flags & DQUOT_SPACE_WARN)
prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
- return -EDQUOT;
+ ret = -EDQUOT;
+ goto out;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
@@ -1342,28 +1319,34 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
dquot->dq_dqb.dqb_btime &&
ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
- if (!prealloc)
+ if (flags & DQUOT_SPACE_WARN)
prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
- return -EDQUOT;
+ ret = -EDQUOT;
+ goto out;
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
tspace > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime == 0) {
- if (!prealloc) {
+ if (flags & DQUOT_SPACE_WARN) {
prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
- }
- else
+ } else {
/*
* We don't allow preallocation to exceed softlimit so exceeding will
* be always printed
*/
- return -EDQUOT;
+ ret = -EDQUOT;
+ goto out;
+ }
}
-
- return 0;
+add:
+ dquot->dq_dqb.dqb_rsvspace += rsv_space;
+ dquot->dq_dqb.dqb_curspace += space;
+out:
+ spin_unlock(&dquot->dq_dqb_lock);
+ return ret;
}
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
@@ -1502,8 +1485,15 @@ static int __dquot_initialize(struct inode *inode, int type)
* did a write before quota was turned on
*/
rsv = inode_get_rsv_space(inode);
- if (unlikely(rsv))
- dquot_resv_space(dquots[cnt], rsv);
+ if (unlikely(rsv)) {
+ spin_lock(&inode->i_lock);
+ /* Get reservation again under proper lock */
+ rsv = __inode_get_rsv_space(inode);
+ spin_lock(&dquots[cnt]->dq_dqb_lock);
+ dquots[cnt]->dq_dqb.dqb_rsvspace += rsv;
+ spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_unlock(&inode->i_lock);
+ }
}
}
out_lock:
@@ -1598,39 +1588,12 @@ static qsize_t *inode_reserved_space(struct inode * inode)
return inode->i_sb->dq_op->get_reserved_space(inode);
}
-void inode_add_rsv_space(struct inode *inode, qsize_t number)
-{
- spin_lock(&inode->i_lock);
- *inode_reserved_space(inode) += number;
- spin_unlock(&inode->i_lock);
-}
-EXPORT_SYMBOL(inode_add_rsv_space);
-
-void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+static qsize_t __inode_get_rsv_space(struct inode *inode)
{
- spin_lock(&inode->i_lock);
- *inode_reserved_space(inode) -= number;
- __inode_add_bytes(inode, number);
- spin_unlock(&inode->i_lock);
-}
-EXPORT_SYMBOL(inode_claim_rsv_space);
-
-void inode_reclaim_rsv_space(struct inode *inode, qsize_t number)
-{
- spin_lock(&inode->i_lock);
- *inode_reserved_space(inode) += number;
- __inode_sub_bytes(inode, number);
- spin_unlock(&inode->i_lock);
-}
-EXPORT_SYMBOL(inode_reclaim_rsv_space);
-
-void inode_sub_rsv_space(struct inode *inode, qsize_t number)
-{
- spin_lock(&inode->i_lock);
- *inode_reserved_space(inode) -= number;
- spin_unlock(&inode->i_lock);
+ if (!inode->i_sb->dq_op->get_reserved_space)
+ return 0;
+ return *inode_reserved_space(inode);
}
-EXPORT_SYMBOL(inode_sub_rsv_space);
static qsize_t inode_get_rsv_space(struct inode *inode)
{
@@ -1639,28 +1602,11 @@ static qsize_t inode_get_rsv_space(struct inode *inode)
if (!inode->i_sb->dq_op->get_reserved_space)
return 0;
spin_lock(&inode->i_lock);
- ret = *inode_reserved_space(inode);
+ ret = __inode_get_rsv_space(inode);
spin_unlock(&inode->i_lock);
return ret;
}
-static void inode_incr_space(struct inode *inode, qsize_t number,
- int reserve)
-{
- if (reserve)
- inode_add_rsv_space(inode, number);
- else
- inode_add_bytes(inode, number);
-}
-
-static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
-{
- if (reserve)
- inode_sub_rsv_space(inode, number);
- else
- inode_sub_bytes(inode, number);
-}
-
/*
* This functions updates i_blocks+i_bytes fields and quota information
* (together with appropriate checks).
@@ -1682,7 +1628,13 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
struct dquot **dquots;
if (!dquot_active(inode)) {
- inode_incr_space(inode, number, reserve);
+ if (reserve) {
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) += number;
+ spin_unlock(&inode->i_lock);
+ } else {
+ inode_add_bytes(inode, number);
+ }
goto out;
}
@@ -1691,27 +1643,41 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!dquots[cnt])
continue;
- ret = check_bdq(dquots[cnt], number,
- !(flags & DQUOT_SPACE_WARN), &warn[cnt]);
- if (ret && !(flags & DQUOT_SPACE_NOFAIL)) {
- spin_unlock(&dq_data_lock);
+ if (flags & DQUOT_SPACE_RESERVE) {
+ ret = dquot_add_space(dquots[cnt], 0, number, flags,
+ &warn[cnt]);
+ } else {
+ ret = dquot_add_space(dquots[cnt], number, 0, flags,
+ &warn[cnt]);
+ }
+ if (ret) {
+ /* Back out changes we already did */
+ for (cnt--; cnt >= 0; cnt--) {
+ if (!dquots[cnt])
+ continue;
+ spin_lock(&dquots[cnt]->dq_dqb_lock);
+ if (flags & DQUOT_SPACE_RESERVE) {
+ dquots[cnt]->dq_dqb.dqb_rsvspace -=
+ number;
+ } else {
+ dquots[cnt]->dq_dqb.dqb_curspace -=
+ number;
+ }
+ spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ }
+ spin_unlock(&inode->i_lock);
goto out_flush_warn;
}
}
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
- continue;
- if (reserve)
- dquot_resv_space(dquots[cnt], number);
- else
- dquot_incr_space(dquots[cnt], number);
- }
- inode_incr_space(inode, number, reserve);
- spin_unlock(&dq_data_lock);
+ if (reserve)
+ *inode_reserved_space(inode) += number;
+ else
+ __inode_add_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
if (reserve)
goto out_flush_warn;
@@ -1740,23 +1706,26 @@ int dquot_alloc_inode(struct inode *inode)
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!dquots[cnt])
continue;
- ret = check_idq(dquots[cnt], 1, &warn[cnt]);
- if (ret)
+ ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]);
+ if (ret) {
+ for (cnt--; cnt >= 0; cnt--) {
+ if (!dquots[cnt])
+ continue;
+ /* Back out changes we already did */
+ spin_lock(&dquots[cnt]->dq_dqb_lock);
+ dquots[cnt]->dq_dqb.dqb_curinodes--;
+ spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ }
goto warn_put_all;
- }
-
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
- continue;
- dquot_incr_inodes(dquots[cnt], 1);
+ }
}
warn_put_all:
- spin_unlock(&dq_data_lock);
+ spin_unlock(&inode->i_lock);
if (ret == 0)
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
@@ -1774,21 +1743,33 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
int cnt, index;
if (!dquot_active(inode)) {
- inode_claim_rsv_space(inode, number);
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ __inode_add_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
return 0;
}
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt])
- dquot_claim_reserved_space(dquots[cnt], number);
+ if (dquots[cnt]) {
+ struct dquot *dquot = dquots[cnt];
+
+ spin_lock(&dquot->dq_dqb_lock);
+ if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
+ number = dquot->dq_dqb.dqb_rsvspace;
+ dquot->dq_dqb.dqb_curspace += number;
+ dquot->dq_dqb.dqb_rsvspace -= number;
+ spin_unlock(&dquot->dq_dqb_lock);
+ }
}
/* Update inode bytes */
- inode_claim_rsv_space(inode, number);
- spin_unlock(&dq_data_lock);
+ *inode_reserved_space(inode) -= number;
+ __inode_add_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
return 0;
@@ -1804,21 +1785,33 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
int cnt, index;
if (!dquot_active(inode)) {
- inode_reclaim_rsv_space(inode, number);
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) += number;
+ __inode_sub_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
return;
}
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt])
- dquot_reclaim_reserved_space(dquots[cnt], number);
+ if (dquots[cnt]) {
+ struct dquot *dquot = dquots[cnt];
+
+ spin_lock(&dquot->dq_dqb_lock);
+ if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
+ number = dquot->dq_dqb.dqb_curspace;
+ dquot->dq_dqb.dqb_rsvspace += number;
+ dquot->dq_dqb.dqb_curspace -= number;
+ spin_unlock(&dquot->dq_dqb_lock);
+ }
}
/* Update inode bytes */
- inode_reclaim_rsv_space(inode, number);
- spin_unlock(&dq_data_lock);
+ *inode_reserved_space(inode) += number;
+ __inode_sub_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
return;
@@ -1836,19 +1829,26 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
int reserve = flags & DQUOT_SPACE_RESERVE, index;
if (!dquot_active(inode)) {
- inode_decr_space(inode, number, reserve);
+ if (reserve) {
+ spin_lock(&inode->i_lock);
+ *inode_reserved_space(inode) -= number;
+ spin_unlock(&inode->i_lock);
+ } else {
+ inode_sub_bytes(inode, number);
+ }
return;
}
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
warn[cnt].w_type = QUOTA_NL_NOWARN;
if (!dquots[cnt])
continue;
+ spin_lock(&dquots[cnt]->dq_dqb_lock);
wtype = info_bdq_free(dquots[cnt], number);
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn[cnt], dquots[cnt], wtype);
@@ -1856,9 +1856,13 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
dquot_free_reserved_space(dquots[cnt], number);
else
dquot_decr_space(dquots[cnt], number);
+ spin_unlock(&dquots[cnt]->dq_dqb_lock);
}
- inode_decr_space(inode, number, reserve);
- spin_unlock(&dq_data_lock);
+ if (reserve)
+ *inode_reserved_space(inode) -= number;
+ else
+ __inode_sub_bytes(inode, number);
+ spin_unlock(&inode->i_lock);
if (reserve)
goto out_unlock;
@@ -1884,19 +1888,21 @@ void dquot_free_inode(struct inode *inode)
dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
- spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
warn[cnt].w_type = QUOTA_NL_NOWARN;
if (!dquots[cnt])
continue;
+ spin_lock(&dquots[cnt]->dq_dqb_lock);
wtype = info_idq_free(dquots[cnt], 1);
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn[cnt], dquots[cnt], wtype);
dquot_decr_inodes(dquots[cnt], 1);
+ spin_unlock(&dquots[cnt]->dq_dqb_lock);
}
- spin_unlock(&dq_data_lock);
+ spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
flush_warnings(warn);
@@ -1917,7 +1923,7 @@ EXPORT_SYMBOL(dquot_free_inode);
*/
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
- qsize_t space, cur_space;
+ qsize_t cur_space;
qsize_t rsv_space = 0;
qsize_t inode_usage = 1;
struct dquot *transfer_from[MAXQUOTAS] = {};
@@ -1944,14 +1950,18 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
}
spin_lock(&dq_data_lock);
+ spin_lock(&inode->i_lock);
if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
+ spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
return 0;
}
- cur_space = inode_get_bytes(inode);
- rsv_space = inode_get_rsv_space(inode);
- space = cur_space + rsv_space;
- /* Build the transfer_from list and check the limits */
+ cur_space = __inode_get_bytes(inode);
+ rsv_space = __inode_get_rsv_space(inode);
+ /*
+ * Build the transfer_from list, check limits, and update usage in
+ * the target structures.
+ */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
/*
* Skip changes for same uid or gid or for turned off quota-type.
@@ -1963,28 +1973,33 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
continue;
is_valid[cnt] = 1;
transfer_from[cnt] = i_dquot(inode)[cnt];
- ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]);
+ ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
+ &warn_to[cnt]);
if (ret)
goto over_quota;
- ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]);
- if (ret)
+ ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0,
+ &warn_to[cnt]);
+ if (ret) {
+ dquot_decr_inodes(transfer_to[cnt], inode_usage);
goto over_quota;
+ }
}
- /*
- * Finally perform the needed transfer from transfer_from to transfer_to
- */
+ /* Decrease usage for source structures and update quota pointers */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (!is_valid[cnt])
continue;
/* Due to IO error we might not have transfer_from[] structure */
if (transfer_from[cnt]) {
int wtype;
+
+ spin_lock(&transfer_from[cnt]->dq_dqb_lock);
wtype = info_idq_free(transfer_from[cnt], inode_usage);
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn_from_inodes[cnt],
transfer_from[cnt], wtype);
- wtype = info_bdq_free(transfer_from[cnt], space);
+ wtype = info_bdq_free(transfer_from[cnt],
+ cur_space + rsv_space);
if (wtype != QUOTA_NL_NOWARN)
prepare_warning(&warn_from_space[cnt],
transfer_from[cnt], wtype);
@@ -1992,14 +2007,11 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
dquot_decr_space(transfer_from[cnt], cur_space);
dquot_free_reserved_space(transfer_from[cnt],
rsv_space);
+ spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
}
-
- dquot_incr_inodes(transfer_to[cnt], inode_usage);
- dquot_incr_space(transfer_to[cnt], cur_space);
- dquot_resv_space(transfer_to[cnt], rsv_space);
-
i_dquot(inode)[cnt] = transfer_to[cnt];
}
+ spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
mark_all_dquot_dirty(transfer_from);
@@ -2013,6 +2025,17 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
transfer_to[cnt] = transfer_from[cnt];
return 0;
over_quota:
+ /* Back out changes we already did */
+ for (cnt--; cnt >= 0; cnt--) {
+ if (!is_valid[cnt])
+ continue;
+ spin_lock(&transfer_to[cnt]->dq_dqb_lock);
+ dquot_decr_inodes(transfer_to[cnt], inode_usage);
+ dquot_decr_space(transfer_to[cnt], cur_space);
+ dquot_free_reserved_space(transfer_to[cnt], rsv_space);
+ spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
+ }
+ spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
flush_warnings(warn_to);
return ret;
@@ -2066,29 +2089,21 @@ EXPORT_SYMBOL(dquot_transfer);
*/
int dquot_commit_info(struct super_block *sb, int type)
{
- int ret;
struct quota_info *dqopt = sb_dqopt(sb);
- mutex_lock(&dqopt->dqio_mutex);
- ret = dqopt->ops[type]->write_file_info(sb, type);
- mutex_unlock(&dqopt->dqio_mutex);
- return ret;
+ return dqopt->ops[type]->write_file_info(sb, type);
}
EXPORT_SYMBOL(dquot_commit_info);
int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
{
struct quota_info *dqopt = sb_dqopt(sb);
- int err;
if (!sb_has_quota_active(sb, qid->type))
return -ESRCH;
if (!dqopt->ops[qid->type]->get_next_id)
return -ENOSYS;
- mutex_lock(&dqopt->dqio_mutex);
- err = dqopt->ops[qid->type]->get_next_id(sb, qid);
- mutex_unlock(&dqopt->dqio_mutex);
- return err;
+ return dqopt->ops[qid->type]->get_next_id(sb, qid);
}
EXPORT_SYMBOL(dquot_get_next_id);
@@ -2337,15 +2352,14 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
dqopt->info[type].dqi_format = fmt;
dqopt->info[type].dqi_fmt_id = format_id;
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
- mutex_lock(&dqopt->dqio_mutex);
error = dqopt->ops[type]->read_file_info(sb, type);
- if (error < 0) {
- mutex_unlock(&dqopt->dqio_mutex);
+ if (error < 0)
goto out_file_init;
- }
- if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
+ spin_lock(&dq_data_lock);
dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
- mutex_unlock(&dqopt->dqio_mutex);
+ spin_unlock(&dq_data_lock);
+ }
spin_lock(&dq_state_lock);
dqopt->flags |= dquot_state_flag(flags, type);
spin_unlock(&dq_state_lock);
@@ -2572,7 +2586,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
struct mem_dqblk *dm = &dquot->dq_dqb;
memset(di, 0, sizeof(*di));
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
di->d_spc_hardlimit = dm->dqb_bhardlimit;
di->d_spc_softlimit = dm->dqb_bsoftlimit;
di->d_ino_hardlimit = dm->dqb_ihardlimit;
@@ -2581,7 +2595,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
di->d_ino_count = dm->dqb_curinodes;
di->d_spc_timer = dm->dqb_btime;
di->d_ino_timer = dm->dqb_itime;
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
}
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
@@ -2645,7 +2659,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
(di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
return -ERANGE;
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (di->d_fieldmask & QC_SPACE) {
dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
check_blim = 1;
@@ -2711,7 +2725,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
clear_bit(DQ_FAKE_B, &dquot->dq_flags);
else
set_bit(DQ_FAKE_B, &dquot->dq_flags);
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
mark_dquot_dirty(dquot);
return 0;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 0738972e8d3f..bb3f59bcfcf5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -379,7 +379,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
if (!ddquot)
return -ENOMEM;
- /* dq_off is guarded by dqio_mutex */
+ /* dq_off is guarded by dqio_sem */
if (!dquot->dq_off) {
ret = dq_insert_tree(info, dquot);
if (ret < 0) {
@@ -389,9 +389,9 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
return ret;
}
}
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
dquot->dq_off);
if (ret != info->dqi_entry_size) {
@@ -649,14 +649,14 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
kfree(ddquot);
goto out;
}
- spin_lock(&dq_data_lock);
+ spin_lock(&dquot->dq_dqb_lock);
info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
if (!dquot->dq_dqb.dqb_bhardlimit &&
!dquot->dq_dqb.dqb_bsoftlimit &&
!dquot->dq_dqb.dqb_ihardlimit &&
!dquot->dq_dqb.dqb_isoftlimit)
set_bit(DQ_FAKE_B, &dquot->dq_flags);
- spin_unlock(&dq_data_lock);
+ spin_unlock(&dquot->dq_dqb_lock);
kfree(ddquot);
out:
dqstats_inc(DQST_READS);
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 8fe79beced5c..7ac5298aba70 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -56,8 +56,9 @@ static int v1_read_dqblk(struct dquot *dquot)
{
int type = dquot->dq_id.type;
struct v1_disk_dqblk dqblk;
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
- if (!sb_dqopt(dquot->dq_sb)->files[type])
+ if (!dqopt->files[type])
return -EINVAL;
/* Set structure to 0s in case read fails/is after end of file */
@@ -160,6 +161,7 @@ static int v1_read_file_info(struct super_block *sb, int type)
struct v1_disk_dqblk dqblk;
int ret;
+ down_read(&dqopt->dqio_sem);
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -176,6 +178,7 @@ static int v1_read_file_info(struct super_block *sb, int type)
dqopt->info[type].dqi_bgrace =
dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
out:
+ up_read(&dqopt->dqio_sem);
return ret;
}
@@ -185,7 +188,7 @@ static int v1_write_file_info(struct super_block *sb, int type)
struct v1_disk_dqblk dqblk;
int ret;
- dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
+ down_write(&dqopt->dqio_sem);
ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -193,8 +196,11 @@ static int v1_write_file_info(struct super_block *sb, int type)
ret = -EIO;
goto out;
}
+ spin_lock(&dq_data_lock);
+ dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
dqblk.dqb_itime = dqopt->info[type].dqi_igrace;
dqblk.dqb_btime = dqopt->info[type].dqi_bgrace;
+ spin_unlock(&dq_data_lock);
ret = sb->s_op->quota_write(sb, type, (char *)&dqblk,
sizeof(struct v1_disk_dqblk), v1_dqoff(0));
if (ret == sizeof(struct v1_disk_dqblk))
@@ -202,6 +208,7 @@ static int v1_write_file_info(struct super_block *sb, int type)
else if (ret > 0)
ret = -EIO;
out:
+ up_write(&dqopt->dqio_sem);
return ret;
}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ca71bf881ad1..c0187cda2c1e 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -65,9 +65,11 @@ static int v2_read_header(struct super_block *sb, int type,
if (size != sizeof(struct v2_disk_dqheader)) {
quota_error(sb, "Failed header read: expected=%zd got=%zd",
sizeof(struct v2_disk_dqheader), size);
- return 0;
+ if (size < 0)
+ return size;
+ return -EIO;
}
- return 1;
+ return 0;
}
/* Check whether given file is really vfsv0 quotafile */
@@ -77,7 +79,7 @@ static int v2_check_quota_file(struct super_block *sb, int type)
static const uint quota_magics[] = V2_INITQMAGICS;
static const uint quota_versions[] = V2_INITQVERSIONS;
- if (!v2_read_header(sb, type, &dqhead))
+ if (v2_read_header(sb, type, &dqhead))
return 0;
if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
@@ -90,29 +92,38 @@ static int v2_read_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
struct v2_disk_dqheader dqhead;
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct mem_dqinfo *info = &dqopt->info[type];
struct qtree_mem_dqinfo *qinfo;
ssize_t size;
unsigned int version;
+ int ret;
- if (!v2_read_header(sb, type, &dqhead))
- return -1;
+ down_read(&dqopt->dqio_sem);
+ ret = v2_read_header(sb, type, &dqhead);
+ if (ret < 0)
+ goto out;
version = le32_to_cpu(dqhead.dqh_version);
if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) ||
- (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1))
- return -1;
+ (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) {
+ ret = -EINVAL;
+ goto out;
+ }
size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
if (size != sizeof(struct v2_disk_dqinfo)) {
quota_error(sb, "Can't read info structure");
- return -1;
+ if (size < 0)
+ ret = size;
+ else
+ ret = -EIO;
+ goto out;
}
info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
if (!info->dqi_priv) {
- printk(KERN_WARNING
- "Not enough memory for quota information structure.\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
qinfo = info->dqi_priv;
if (version == 0) {
@@ -147,17 +158,22 @@ static int v2_read_file_info(struct super_block *sb, int type)
qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
qinfo->dqi_ops = &v2r1_qtree_ops;
}
- return 0;
+ ret = 0;
+out:
+ up_read(&dqopt->dqio_sem);
+ return ret;
}
/* Write information header to quota file */
static int v2_write_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct mem_dqinfo *info = &dqopt->info[type];
struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
ssize_t size;
+ down_write(&dqopt->dqio_sem);
spin_lock(&dq_data_lock);
info->dqi_flags &= ~DQF_INFO_DIRTY;
dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
@@ -170,6 +186,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+ up_write(&dqopt->dqio_sem);
if (size != sizeof(struct v2_disk_dqinfo)) {
quota_error(sb, "Can't write info structure");
return -1;
@@ -285,17 +302,51 @@ static int v2r1_is_id(void *dp, struct dquot *dquot)
static int v2_read_dquot(struct dquot *dquot)
{
- return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ int ret;
+
+ down_read(&dqopt->dqio_sem);
+ ret = qtree_read_dquot(
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
+ dquot);
+ up_read(&dqopt->dqio_sem);
+ return ret;
}
static int v2_write_dquot(struct dquot *dquot)
{
- return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ int ret;
+ bool alloc = false;
+
+ /*
+ * If space for dquot is already allocated, we don't need any
+ * protection as we'll only overwrite the place of dquot. We are
+ * still protected by concurrent writes of the same dquot by
+ * dquot->dq_lock.
+ */
+ if (!dquot->dq_off) {
+ alloc = true;
+ down_write(&dqopt->dqio_sem);
+ }
+ ret = qtree_write_dquot(
+ sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
+ dquot);
+ if (alloc)
+ up_write(&dqopt->dqio_sem);
+ return ret;
}
static int v2_release_dquot(struct dquot *dquot)
{
- return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+ int ret;
+
+ down_write(&dqopt->dqio_sem);
+ ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+ up_write(&dqopt->dqio_sem);
+
+ return ret;
}
static int v2_free_file_info(struct super_block *sb, int type)
@@ -306,7 +357,13 @@ static int v2_free_file_info(struct super_block *sb, int type)
static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
{
- return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int ret;
+
+ down_read(&dqopt->dqio_sem);
+ ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+ up_read(&dqopt->dqio_sem);
+ return ret;
}
static const struct quota_format_ops v2_format_ops = {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
if (!pages)
goto out_free;
- nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+ nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
if (nr != lpages)
goto out_free_pages; /* leave if some pages were missing */
diff --git a/fs/read_write.c b/fs/read_write.c
index 0cc7033aa413..61b58c7b6531 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-static inline int unsigned_offsets(struct file *file)
+static inline bool unsigned_offsets(struct file *file)
{
return file->f_mode & FMODE_UNSIGNED_OFFSET;
}
@@ -633,7 +633,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
EXPORT_SYMBOL(iov_shorten);
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, int type, int flags)
+ loff_t *ppos, int type, rwf_t flags)
{
struct kiocb kiocb;
ssize_t ret;
@@ -655,7 +655,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, int type, int flags)
+ loff_t *ppos, int type, rwf_t flags)
{
ssize_t ret = 0;
@@ -871,7 +871,7 @@ out:
#endif
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
- loff_t *pos, int flags)
+ loff_t *pos, rwf_t flags)
{
size_t tot_len;
ssize_t ret = 0;
@@ -899,7 +899,7 @@ out:
}
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags)
+ rwf_t flags)
{
if (!file->f_op->read_iter)
return -EINVAL;
@@ -908,7 +908,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
EXPORT_SYMBOL(vfs_iter_read);
static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
- loff_t *pos, int flags)
+ loff_t *pos, rwf_t flags)
{
size_t tot_len;
ssize_t ret = 0;
@@ -935,7 +935,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
}
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- int flags)
+ rwf_t flags)
{
if (!file->f_op->write_iter)
return -EINVAL;
@@ -944,7 +944,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
EXPORT_SYMBOL(vfs_iter_write);
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, int flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -962,7 +962,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
EXPORT_SYMBOL(vfs_readv);
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, int flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -981,7 +981,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
EXPORT_SYMBOL(vfs_writev);
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
- unsigned long vlen, int flags)
+ unsigned long vlen, rwf_t flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
@@ -1001,7 +1001,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
}
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
- unsigned long vlen, int flags)
+ unsigned long vlen, rwf_t flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
@@ -1027,7 +1027,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
}
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
- unsigned long vlen, loff_t pos, int flags)
+ unsigned long vlen, loff_t pos, rwf_t flags)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -1050,7 +1050,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
}
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
- unsigned long vlen, loff_t pos, int flags)
+ unsigned long vlen, loff_t pos, rwf_t flags)
{
struct fd f;
ssize_t ret = -EBADF;
@@ -1094,7 +1094,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
- int, flags)
+ rwf_t, flags)
{
loff_t pos = pos_from_hilo(pos_h, pos_l);
@@ -1114,7 +1114,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
- int, flags)
+ rwf_t, flags)
{
loff_t pos = pos_from_hilo(pos_h, pos_l);
@@ -1127,7 +1127,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
#ifdef CONFIG_COMPAT
static size_t compat_readv(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, int flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -1147,7 +1147,7 @@ static size_t compat_readv(struct file *file,
static size_t do_compat_readv(compat_ulong_t fd,
const struct compat_iovec __user *vec,
- compat_ulong_t vlen, int flags)
+ compat_ulong_t vlen, rwf_t flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1173,7 +1173,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
static long do_compat_preadv64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, int flags)
+ unsigned long vlen, loff_t pos, rwf_t flags)
{
struct fd f;
ssize_t ret;
@@ -1211,7 +1211,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
- unsigned long, vlen, loff_t, pos, int, flags)
+ unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
@@ -1220,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
- int, flags)
+ rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
@@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, int flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
@@ -1254,7 +1254,7 @@ static size_t compat_writev(struct file *file,
static size_t do_compat_writev(compat_ulong_t fd,
const struct compat_iovec __user* vec,
- compat_ulong_t vlen, int flags)
+ compat_ulong_t vlen, rwf_t flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1279,7 +1279,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
static long do_compat_pwritev64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, int flags)
+ unsigned long vlen, loff_t pos, rwf_t flags)
{
struct fd f;
ssize_t ret;
@@ -1317,7 +1317,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
const struct compat_iovec __user *,vec,
- unsigned long, vlen, loff_t, pos, int, flags)
+ unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
return do_compat_pwritev64(fd, vec, vlen, pos, flags);
}
@@ -1325,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45aa05e2232f..5b50689d8539 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -34,7 +34,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
struct inode *inode = filp->f_mapping->host;
int err;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(filp, start, end);
if (err)
return err;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index b396eb09f288..843aadcc123c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -154,7 +154,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
int err;
int barrier_done;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(filp, start, end);
if (err)
return err;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a11d773e5ff3..0c882a0e2a6e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1481,7 +1481,7 @@ static int flush_journal_list(struct super_block *s,
if ((!was_jwait) && !buffer_locked(saved_bh)) {
reiserfs_warning(s, "journal-813",
"BAD! buffer %llu %cdirty %cjwait, "
- "not in a newer tranasction",
+ "not in a newer transaction",
(unsigned long long)saved_bh->
b_blocknr, was_dirty ? ' ' : '!',
was_jwait ? ' ' : '!');
diff --git a/fs/select.c b/fs/select.c
index 9d5f15ed87fe..c6362e38ae92 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1164,11 +1164,7 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
if (ufdset) {
return compat_get_bitmap(fdset, ufdset, nr);
} else {
- /* Tricky, must clear full unsigned long in the
- * kernel fdset at the end, ALIGN makes sure that
- * actually happens.
- */
- memset(fdset, 0, ALIGN(nr, BITS_PER_LONG));
+ zero_fd_set(nr, fdset);
return 0;
}
}
diff --git a/fs/stat.c b/fs/stat.c
index c35610845ab1..8a6aa8caf891 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -710,7 +710,7 @@ loff_t inode_get_bytes(struct inode *inode)
loff_t ret;
spin_lock(&inode->i_lock);
- ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
+ ret = __inode_get_bytes(inode);
spin_unlock(&inode->i_lock);
return ret;
}
diff --git a/fs/super.c b/fs/super.c
index 6bc3352adcf3..221cfa1f4e92 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -242,7 +242,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
- mutex_init(&s->s_dquot.dqio_mutex);
+ init_rwsem(&s->s_dquot.dqio_sem);
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
diff --git a/fs/sync.c b/fs/sync.c
index 2a54c1f22035..2e3fd7d94d2d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,14 +335,9 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
goto out_put;
mapping = f.file->f_mapping;
- if (!mapping) {
- ret = -EINVAL;
- goto out_put;
- }
-
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
- ret = filemap_fdatawait_range(mapping, offset, endbyte);
+ ret = file_fdatawait_range(f.file, offset, endbyte);
if (ret < 0)
goto out_put;
}
@@ -355,7 +350,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
}
if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
- ret = filemap_fdatawait_range(mapping, offset, endbyte);
+ ret = file_fdatawait_range(f.file, offset, endbyte);
out_put:
fdput(f);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8cad0b19b404..a02aa59d1e24 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1337,7 +1337,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
return 0;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(file, start, end);
if (err)
return err;
inode_lock(inode);
@@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
}
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 18fdb9d90812..8dacf4f57414 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -114,7 +114,7 @@ static void udf_update_extent_cache(struct inode *inode, loff_t estart,
__udf_clear_extent_cache(inode);
if (pos->bh)
get_bh(pos->bh);
- memcpy(&iinfo->cached_extent.epos, pos, sizeof(struct extent_position));
+ memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos));
iinfo->cached_extent.lstart = estart;
switch (iinfo->i_alloc_type) {
case ICBTAG_FLAG_AD_SHORT:
@@ -1572,13 +1572,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size)
{
struct udf_inode_info *iinfo = UDF_I(inode);
iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL);
-
- if (!iinfo->i_ext.i_data) {
- udf_err(inode->i_sb, "(ino %ld) no free memory\n",
- inode->i_ino);
+ if (!iinfo->i_ext.i_data)
return -ENOMEM;
- }
-
return 0;
}
@@ -1703,7 +1698,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
}
eid = (struct regid *)dsea->impUse;
- memset(eid, 0, sizeof(struct regid));
+ memset(eid, 0, sizeof(*eid));
strcpy(eid->ident, UDF_ID_DEVELOPER);
eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1754,7 +1749,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
- memset(&(efe->impIdent), 0, sizeof(struct regid));
+ memset(&(efe->impIdent), 0, sizeof(efe->impIdent));
strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 385ee89d5824..885198dfd9f8 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1184,7 +1184,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
ncfi.fileVersionNum = ocfi.fileVersionNum;
ncfi.fileCharacteristics = ocfi.fileCharacteristics;
- memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
+ memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb));
udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
/* The old fid may have moved - find it again */
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 462ac2e9258c..93c59630512b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -266,11 +266,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
{
struct udf_sb_info *sbi = UDF_SB(sb);
- sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map),
- GFP_KERNEL);
+ sbi->s_partmaps = kcalloc(count, sizeof(*sbi->s_partmaps), GFP_KERNEL);
if (!sbi->s_partmaps) {
- udf_err(sb, "Unable to allocate space for %d partition maps\n",
- count);
sbi->s_partitions = 0;
return -ENOMEM;
}
@@ -324,7 +321,8 @@ static void udf_sb_free_partitions(struct super_block *sb)
{
struct udf_sb_info *sbi = UDF_SB(sb);
int i;
- if (sbi->s_partmaps == NULL)
+
+ if (!sbi->s_partmaps)
return;
for (i = 0; i < sbi->s_partitions; i++)
udf_free_partition(&sbi->s_partmaps[i]);
@@ -1071,7 +1069,7 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
else
bitmap = vzalloc(size); /* TODO: get rid of vzalloc */
- if (bitmap == NULL)
+ if (!bitmap)
return NULL;
bitmap->s_nr_groups = nr_groups;
@@ -2099,7 +2097,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
uopt.fmode = UDF_INVALID_MODE;
uopt.dmode = UDF_INVALID_MODE;
- sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b0d5897bc4e6..ef4b48d1ea42 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
goto out;
WRITE_ONCE(uwq->waken, true);
/*
- * The implicit smp_mb__before_spinlock in try_to_wake_up()
- * renders uwq->waken visible to other CPUs before the task is
- * waken.
+ * The Program-Order guarantees provided by the scheduler
+ * ensure uwq->waken is visible before the task is woken.
*/
ret = wake_up_state(wq->private, mode);
- if (ret)
+ if (ret) {
/*
* Wake only once, autoremove behavior.
*
- * After the effect of list_del_init is visible to the
- * other CPUs, the waitqueue may disappear from under
- * us, see the !list_empty_careful() in
- * handle_userfault(). try_to_wake_up() has an
- * implicit smp_mb__before_spinlock, and the
- * wq->private is read before calling the extern
- * function "wake_up_state" (which in turns calls
- * try_to_wake_up). While the spin_lock;spin_unlock;
- * wouldn't be enough, the smp_mb__before_spinlock is
- * enough to avoid an explicit smp_mb() here.
+ * After the effect of list_del_init is visible to the other
+ * CPUs, the waitqueue may disappear from under us, see the
+ * !list_empty_careful() in handle_userfault().
+ *
+ * try_to_wake_up() has an implicit smp_mb(), and the
+ * wq->private is read before calling the extern function
+ * "wake_up_state" (which in turns calls try_to_wake_up).
*/
list_del_init(&wq->entry);
+ }
out:
return ret;
}
@@ -181,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
static inline struct uffd_msg userfault_msg(unsigned long address,
unsigned int flags,
- unsigned long reason)
+ unsigned long reason,
+ unsigned int features)
{
struct uffd_msg msg;
msg_init(&msg);
@@ -205,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* write protect fault.
*/
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ if (features & UFFD_FEATURE_THREAD_ID)
+ msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
return msg;
}
@@ -373,13 +373,34 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+ if (ctx->features & UFFD_FEATURE_SIGBUS)
+ goto out;
+
/*
* If it's already released don't get it. This avoids to loop
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
- if (unlikely(ACCESS_ONCE(ctx->released)))
+ if (unlikely(ACCESS_ONCE(ctx->released))) {
+ /*
+ * Don't return VM_FAULT_SIGBUS in this case, so a non
+ * cooperative manager can close the uffd after the
+ * last UFFDIO_COPY, without risking to trigger an
+ * involuntary SIGBUS if the process was starting the
+ * userfaultfd while the userfaultfd was still armed
+ * (but after the last UFFDIO_COPY). If the uffd
+ * wasn't already closed when the userfault reached
+ * this point, that would normally be solved by
+ * userfaultfd_must_wait returning 'false'.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd.
+ */
+ ret = VM_FAULT_NOPAGE;
goto out;
+ }
/*
* Check that we can return VM_FAULT_RETRY.
@@ -422,7 +443,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -1197,7 +1219,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
struct uffdio_register __user *user_uffdio_register;
unsigned long vm_flags, new_flags;
bool found;
- bool non_anon_pages;
+ bool basic_ioctls;
unsigned long start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1263,7 +1285,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* Search for not compatible vmas.
*/
found = false;
- non_anon_pages = false;
+ basic_ioctls = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
@@ -1302,8 +1324,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/*
* Note vmas containing huge pages
*/
- if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
- non_anon_pages = true;
+ if (is_vm_hugetlb_page(cur))
+ basic_ioctls = true;
found = true;
}
@@ -1374,7 +1396,7 @@ out_unlock:
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+ if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
UFFD_API_RANGE_IOCTLS,
&user_uffdio_register->ioctls))
ret = -EFAULT;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index de7b9bd30bec..6249c92671de 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -328,20 +328,19 @@ xfs_attr_set(
*/
xfs_defer_init(args.dfops, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
- if (!error)
- error = xfs_defer_finish(&args.trans, args.dfops, dp);
- if (error) {
- args.trans = NULL;
- xfs_defer_cancel(&dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args.dfops, dp);
+ error = xfs_defer_finish(&args.trans, args.dfops);
+ if (error)
+ goto out_defer_cancel;
/*
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
- error = xfs_trans_roll(&args.trans, dp);
+ error = xfs_trans_roll_inode(&args.trans, dp);
if (error)
goto out;
@@ -373,6 +372,9 @@ xfs_attr_set(
return error;
+out_defer_cancel:
+ xfs_defer_cancel(&dfops);
+ args.trans = NULL;
out:
if (args.trans)
xfs_trans_cancel(args.trans);
@@ -593,19 +595,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- return error;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
/*
* Commit the current trans (including the inode) and start
* a new one.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
return error;
@@ -620,7 +621,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit the transaction that added the attr name so that
* later routines can manage their own transactions.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
return error;
@@ -684,20 +685,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error)
- error = xfs_defer_finish(&args->trans,
- args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- return error;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
}
/*
* Commit the remove and start the next trans in series.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
} else if (args->rmtblkno > 0) {
/*
@@ -706,6 +705,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
error = xfs_attr3_leaf_clearflag(args);
}
return error;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ return error;
}
/*
@@ -747,15 +750,18 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- return error;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
}
return 0;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ return error;
}
/*
@@ -872,20 +878,18 @@ restart:
state = NULL;
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error)
- error = xfs_defer_finish(&args->trans,
- args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
/*
* Commit the node conversion and start the next
* trans in the chain.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
goto out;
@@ -900,13 +904,12 @@ restart:
*/
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_split(state);
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -925,7 +928,7 @@ restart:
* Commit the leaf addition or btree split and start the next
* trans in the chain.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
goto out;
@@ -999,20 +1002,18 @@ restart:
if (retval && (state->path.active > 1)) {
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
- if (!error)
- error = xfs_defer_finish(&args->trans,
- args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
}
/*
* Commit and start the next trans in the chain.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
goto out;
@@ -1032,6 +1033,10 @@ out:
if (error)
return error;
return retval;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ goto out;
}
/*
@@ -1122,17 +1127,16 @@ xfs_attr_node_removename(xfs_da_args_t *args)
if (retval && (state->path.active > 1)) {
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_da3_join(state);
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
/*
* Commit the Btree join operation and start a new trans.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
goto out;
}
@@ -1156,14 +1160,12 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_defer_init(args->dfops, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error)
- error = xfs_defer_finish(&args->trans,
- args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- goto out;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
} else
xfs_trans_brelse(args->trans, bp);
}
@@ -1172,6 +1174,10 @@ xfs_attr_node_removename(xfs_da_args_t *args)
out:
xfs_da_state_free(state);
return error;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ goto out;
}
/*
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index c6c15e5717e4..5c16db86b38f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag(
/*
* Commit the flag value change and start the next trans in series.
*/
- return xfs_trans_roll(&args->trans, args->dp);
+ return xfs_trans_roll_inode(&args->trans, args->dp);
}
/*
@@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag(
/*
* Commit the flag value change and start the next trans in series.
*/
- return xfs_trans_roll(&args->trans, args->dp);
+ return xfs_trans_roll_inode(&args->trans, args->dp);
}
/*
@@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags(
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_trans_roll(&args->trans, args->dp);
+ error = xfs_trans_roll_inode(&args->trans, args->dp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 5236d8e45146..d56caf037ca0 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -467,13 +467,12 @@ xfs_attr_rmtval_set(
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
args->total, &map, &nmap, args->dfops);
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops, dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- return error;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
@@ -484,7 +483,7 @@ xfs_attr_rmtval_set(
/*
* Start the next trans in the chain.
*/
- error = xfs_trans_roll(&args->trans, dp);
+ error = xfs_trans_roll_inode(&args->trans, dp);
if (error)
return error;
}
@@ -539,6 +538,10 @@ xfs_attr_rmtval_set(
}
ASSERT(valuelen == 0);
return 0;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ return error;
}
/*
@@ -609,21 +612,23 @@ xfs_attr_rmtval_remove(
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, args->firstblock,
args->dfops, &done);
- if (!error)
- error = xfs_defer_finish(&args->trans, args->dfops,
- args->dp);
- if (error) {
- args->trans = NULL;
- xfs_defer_cancel(args->dfops);
- return error;
- }
+ if (error)
+ goto out_defer_cancel;
+ xfs_defer_ijoin(args->dfops, args->dp);
+ error = xfs_defer_finish(&args->trans, args->dfops);
+ if (error)
+ goto out_defer_cancel;
/*
* Close out trans and start the next one in the chain.
*/
- error = xfs_trans_roll(&args->trans, args->dp);
+ error = xfs_trans_roll_inode(&args->trans, args->dp);
if (error)
return error;
}
return 0;
+out_defer_cancel:
+ xfs_defer_cancel(args->dfops);
+ args->trans = NULL;
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c09c16b1ad3b..459f4b4f08fe 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
#else
#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
#endif /* DEBUG */
/*
@@ -880,7 +880,7 @@ xfs_bmap_local_to_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_alloc_arg_t args; /* allocation arguments */
xfs_buf_t *bp; /* buffer for extent block */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
+ struct xfs_bmbt_irec rec;
/*
* We don't want to deal with the case of keeping inode data inline yet.
@@ -943,9 +943,12 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
- xfs_iext_add(ifp, 0, 1);
- ep = xfs_iext_get_ext(ifp, 0);
- xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+ rec.br_startoff = 0;
+ rec.br_startblock = args.fsbno;
+ rec.br_blockcount = 1;
+ rec.br_state = XFS_EXT_NORM;
+ xfs_iext_insert(ip, 0, 1, &rec, 0);
+
trace_xfs_bmap_post_update(ip, 0,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
_THIS_IP_);
@@ -1196,7 +1199,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1356,7 +1359,6 @@ xfs_bmap_first_unused(
xfs_fileoff_t lastaddr; /* last block number seen */
xfs_fileoff_t lowest; /* lowest useful block */
xfs_fileoff_t max; /* starting useful block */
- xfs_fileoff_t off; /* offset for this block */
xfs_extnum_t nextents; /* number of extent entries */
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
@@ -1373,16 +1375,19 @@ xfs_bmap_first_unused(
lowest = *first_unused;
nextents = xfs_iext_count(ifp);
for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- off = xfs_bmbt_get_startoff(ep);
+ struct xfs_bmbt_irec got;
+
+ xfs_iext_get_extent(ifp, idx, &got);
+
/*
* See if the hole before this extent will work.
*/
- if (off >= lowest + len && off - max >= len) {
+ if (got.br_startoff >= lowest + len &&
+ got.br_startoff - max >= len) {
*first_unused = max;
return 0;
}
- lastaddr = off + xfs_bmbt_get_blockcount(ep);
+ lastaddr = got.br_startoff + got.br_blockcount;
max = XFS_FILEOFF_MAX(lastaddr, lowest);
}
*first_unused = max;
@@ -4918,7 +4923,7 @@ xfs_bmap_del_extent_delay(
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case BMAP_RIGHT_CONTIG:
@@ -4930,7 +4935,7 @@ xfs_bmap_del_extent_delay(
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip,
got->br_blockcount), da_old);
got->br_startblock = nullstartblock((int)da_new);
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case 0:
@@ -4956,7 +4961,7 @@ xfs_bmap_del_extent_delay(
del->br_blockcount);
got->br_startblock = nullstartblock((int)got_indlen);
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_);
new.br_startoff = del_endoff;
@@ -5026,7 +5031,7 @@ xfs_bmap_del_extent_cow(
got->br_startoff = del_endoff;
got->br_blockcount -= del->br_blockcount;
got->br_startblock = del->br_startblock + del->br_blockcount;
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case BMAP_RIGHT_CONTIG:
@@ -5035,7 +5040,7 @@ xfs_bmap_del_extent_cow(
*/
trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount -= del->br_blockcount;
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case 0:
@@ -5044,7 +5049,7 @@ xfs_bmap_del_extent_cow(
*/
trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
got->br_blockcount = del->br_startoff - got->br_startoff;
- xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got);
+ xfs_iext_update_extent(ifp, *idx, got);
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
new.br_startoff = del_endoff;
@@ -5876,32 +5881,26 @@ xfs_bmse_merge(
int whichfork,
xfs_fileoff_t shift, /* shift fsb */
int current_ext, /* idx of gotp */
- struct xfs_bmbt_rec_host *gotp, /* extent to shift */
- struct xfs_bmbt_rec_host *leftp, /* preceding extent */
+ struct xfs_bmbt_irec *got, /* extent to shift */
+ struct xfs_bmbt_irec *left, /* preceding extent */
struct xfs_btree_cur *cur,
- int *logflags) /* output */
+ int *logflags, /* output */
+ struct xfs_defer_ops *dfops)
{
- struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec left;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
struct xfs_mount *mp = ip->i_mount;
- xfs_bmbt_get_all(gotp, &got);
- xfs_bmbt_get_all(leftp, &left);
- blockcount = left.br_blockcount + got.br_blockcount;
+ blockcount = left->br_blockcount + got->br_blockcount;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+ ASSERT(xfs_bmse_can_merge(left, got, shift));
- /*
- * Merge the in-core extents. Note that the host record pointers and
- * current_ext index are invalid once the extent has been removed via
- * xfs_iext_remove().
- */
- xfs_bmbt_set_blockcount(leftp, blockcount);
- xfs_iext_remove(ip, current_ext, 1, 0);
+ new = *left;
+ new.br_blockcount = blockcount;
/*
* Update the on-disk extent count, the btree if necessary and log the
@@ -5912,12 +5911,12 @@ xfs_bmse_merge(
*logflags |= XFS_ILOG_CORE;
if (!cur) {
*logflags |= XFS_ILOG_DEXT;
- return 0;
+ goto done;
}
/* lookup and remove the extent to merge */
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock,
+ got->br_blockcount, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
@@ -5928,16 +5927,28 @@ xfs_bmse_merge(
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
/* lookup and update size of the previous extent */
- error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
- left.br_blockcount, &i);
+ error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock,
+ left->br_blockcount, &i);
if (error)
return error;
XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
- left.br_blockcount = blockcount;
+ error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock,
+ new.br_blockcount, new.br_state);
+ if (error)
+ return error;
+
+done:
+ xfs_iext_update_extent(ifp, current_ext - 1, &new);
+ xfs_iext_remove(ip, current_ext, 1, 0);
- return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
- left.br_blockcount, left.br_state);
+ /* update reverse mapping. rmap functions merge the rmaps for us */
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
+ if (error)
+ return error;
+ memcpy(&new, got, sizeof(new));
+ new.br_startoff = left->br_startoff + left->br_blockcount;
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
}
/*
@@ -5949,7 +5960,7 @@ xfs_bmse_shift_one(
int whichfork,
xfs_fileoff_t offset_shift_fsb,
int *current_ext,
- struct xfs_bmbt_rec_host *gotp,
+ struct xfs_bmbt_irec *got,
struct xfs_btree_cur *cur,
int *logflags,
enum shift_direction direction,
@@ -5958,9 +5969,7 @@ xfs_bmse_shift_one(
struct xfs_ifork *ifp;
struct xfs_mount *mp;
xfs_fileoff_t startoff;
- struct xfs_bmbt_rec_host *adj_irecp;
- struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec adj_irec;
+ struct xfs_bmbt_irec adj_irec, new;
int error;
int i;
int total_extents;
@@ -5969,13 +5978,11 @@ xfs_bmse_shift_one(
ifp = XFS_IFORK_PTR(ip, whichfork);
total_extents = xfs_iext_count(ifp);
- xfs_bmbt_get_all(gotp, &got);
-
/* delalloc extents should be prevented by caller */
- XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock));
if (direction == SHIFT_LEFT) {
- startoff = got.br_startoff - offset_shift_fsb;
+ startoff = got->br_startoff - offset_shift_fsb;
/*
* Check for merge if we've got an extent to the left,
@@ -5983,46 +5990,39 @@ xfs_bmse_shift_one(
* of the file for the shift.
*/
if (!*current_ext) {
- if (got.br_startoff < offset_shift_fsb)
+ if (got->br_startoff < offset_shift_fsb)
return -EINVAL;
goto update_current_ext;
}
+
/*
- * grab the left extent and check for a large
- * enough hole.
+ * grab the left extent and check for a large enough hole.
*/
- adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
- xfs_bmbt_get_all(adj_irecp, &adj_irec);
-
- if (startoff <
- adj_irec.br_startoff + adj_irec.br_blockcount)
+ xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec);
+ if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount)
return -EINVAL;
/* check whether to merge the extent or shift it down */
- if (xfs_bmse_can_merge(&adj_irec, &got,
- offset_shift_fsb)) {
- error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, gotp, adj_irecp,
- cur, logflags);
- if (error)
- return error;
- adj_irec = got;
- goto update_rmap;
+ if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) {
+ return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
+ *current_ext, got, &adj_irec,
+ cur, logflags, dfops);
}
} else {
- startoff = got.br_startoff + offset_shift_fsb;
+ startoff = got->br_startoff + offset_shift_fsb;
/* nothing to move if this is the last extent */
if (*current_ext >= (total_extents - 1))
goto update_current_ext;
+
/*
* If this is not the last extent in the file, make sure there
* is enough room between current extent and next extent for
* accommodating the shift.
*/
- adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
- xfs_bmbt_get_all(adj_irecp, &adj_irec);
- if (startoff + got.br_blockcount > adj_irec.br_startoff)
+ xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec);
+ if (startoff + got->br_blockcount > adj_irec.br_startoff)
return -EINVAL;
+
/*
* Unlike a left shift (which involves a hole punch),
* a right shift does not modify extent neighbors
@@ -6030,45 +6030,48 @@ xfs_bmse_shift_one(
* in this scenario. Check anyways and warn if we
* encounter two extents that could be one.
*/
- if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
+ if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb))
WARN_ON_ONCE(1);
}
+
/*
* Increment the extent index for the next iteration, update the start
* offset of the in-core extent and update the btree if applicable.
*/
update_current_ext:
- if (direction == SHIFT_LEFT)
- (*current_ext)++;
- else
- (*current_ext)--;
- xfs_bmbt_set_startoff(gotp, startoff);
*logflags |= XFS_ILOG_CORE;
- adj_irec = got;
- if (!cur) {
+
+ new = *got;
+ new.br_startoff = startoff;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got->br_startoff,
+ got->br_startblock, got->br_blockcount, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+
+ error = xfs_bmbt_update(cur, new.br_startoff,
+ new.br_startblock, new.br_blockcount,
+ new.br_state);
+ if (error)
+ return error;
+ } else {
*logflags |= XFS_ILOG_DEXT;
- goto update_rmap;
}
- error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, &i);
- if (error)
- return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ xfs_iext_update_extent(ifp, *current_ext, &new);
- got.br_startoff = startoff;
- error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, got.br_state);
- if (error)
- return error;
+ if (direction == SHIFT_LEFT)
+ (*current_ext)++;
+ else
+ (*current_ext)--;
-update_rmap:
/* update reverse mapping */
- error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec);
+ error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got);
if (error)
return error;
- adj_irec.br_startoff = startoff;
- return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec);
+ return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new);
}
/*
@@ -6095,7 +6098,6 @@ xfs_bmap_shift_extents(
int num_exts)
{
struct xfs_btree_cur *cur = NULL;
- struct xfs_bmbt_rec_host *gotp;
struct xfs_bmbt_irec got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -6122,7 +6124,6 @@ xfs_bmap_shift_extents(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
- ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -6154,10 +6155,26 @@ xfs_bmap_shift_extents(
* In case of first right shift, we need to initialize next_fsb
*/
if (*next_fsb == NULLFSBLOCK) {
- gotp = xfs_iext_get_ext(ifp, total_extents - 1);
- xfs_bmbt_get_all(gotp, &got);
+ ASSERT(direction == SHIFT_RIGHT);
+
+ current_ext = total_extents - 1;
+ xfs_iext_get_extent(ifp, current_ext, &got);
+ if (stop_fsb > got.br_startoff) {
+ *done = 1;
+ goto del_cursor;
+ }
*next_fsb = got.br_startoff;
- if (stop_fsb > *next_fsb) {
+ } else {
+ /*
+ * Look up the extent index for the fsb where we start shifting. We can
+ * henceforth iterate with current_ext as extent list changes are locked
+ * out via ilock.
+ *
+ * If next_fsb lies in a hole beyond which there are no extents we are
+ * done.
+ */
+ if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &current_ext,
+ &got)) {
*done = 1;
goto del_cursor;
}
@@ -6165,37 +6182,26 @@ xfs_bmap_shift_extents(
/* Lookup the extent index at which we have to stop */
if (direction == SHIFT_RIGHT) {
- gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
+ struct xfs_bmbt_irec s;
+
+ xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s);
/* Make stop_extent exclusive of shift range */
stop_extent--;
- } else
+ if (current_ext <= stop_extent) {
+ error = -EIO;
+ goto del_cursor;
+ }
+ } else {
stop_extent = total_extents;
-
- /*
- * Look up the extent index for the fsb where we start shifting. We can
- * henceforth iterate with current_ext as extent list changes are locked
- * out via ilock.
- *
- * gotp can be null in 2 cases: 1) if there are no extents or 2)
- * *next_fsb lies in a hole beyond which there are no extents. Either
- * way, we are done.
- */
- gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
- if (!gotp) {
- *done = 1;
- goto del_cursor;
- }
-
- /* some sanity checking before we finally start shifting extents */
- if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
- (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
- error = -EIO;
- goto del_cursor;
+ if (current_ext >= stop_extent) {
+ error = -EIO;
+ goto del_cursor;
+ }
}
while (nexts++ < num_exts) {
error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
- &current_ext, gotp, cur, &logflags,
+ &current_ext, &got, cur, &logflags,
direction, dfops);
if (error)
goto del_cursor;
@@ -6213,13 +6219,11 @@ xfs_bmap_shift_extents(
*next_fsb = NULLFSBLOCK;
break;
}
- gotp = xfs_iext_get_ext(ifp, current_ext);
+ xfs_iext_get_extent(ifp, current_ext, &got);
}
- if (!*done) {
- xfs_bmbt_get_all(gotp, &got);
+ if (!*done)
*next_fsb = got.br_startoff;
- }
del_cursor:
if (cur)
@@ -6248,7 +6252,6 @@ xfs_bmap_split_extent_at(
{
int whichfork = XFS_DATA_FORK;
struct xfs_btree_cur *cur = NULL;
- struct xfs_bmbt_rec_host *gotp;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec new; /* split extent */
struct xfs_mount *mp = ip->i_mount;
@@ -6280,21 +6283,10 @@ xfs_bmap_split_extent_at(
}
/*
- * gotp can be null in 2 cases: 1) if there are no extents
- * or 2) split_fsb lies in a hole beyond which there are
- * no extents. Either way, we are done.
- */
- gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
- if (!gotp)
- return 0;
-
- xfs_bmbt_get_all(gotp, &got);
-
- /*
- * Check split_fsb lies in a hole or the start boundary offset
- * of the extent.
+ * If there are not extents, or split_fsb lies in a hole we are done.
*/
- if (got.br_startoff >= split_fsb)
+ if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, &current_ext, &got) ||
+ got.br_startoff >= split_fsb)
return 0;
gotblkcnt = split_fsb - got.br_startoff;
@@ -6317,8 +6309,8 @@ xfs_bmap_split_extent_at(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
}
- xfs_bmbt_set_blockcount(gotp, gotblkcnt);
got.br_blockcount = gotblkcnt;
+ xfs_iext_update_extent(ifp, current_ext, &got);
logflags = XFS_ILOG_CORE;
if (cur) {
@@ -6402,7 +6394,7 @@ xfs_bmap_split_extent(
if (error)
goto out;
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out;
@@ -6452,7 +6444,7 @@ __xfs_bmap_add(
bi->bi_whichfork = whichfork;
bi->bi_bmap = *bmap;
- error = xfs_defer_join(dfops, bi->bi_owner);
+ error = xfs_defer_ijoin(dfops, bi->bi_owner);
if (error) {
kmem_free(bi);
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 85de22513014..a6331ffa51e3 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -858,6 +858,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
+ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index e0bcc4a59efd..5bfb88261c7e 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block(
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
cur->bc_private.b.ip->i_ino)
@@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner(
/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
+ return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
- else
+ } else {
+ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
+ return 0;
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
+ }
/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner(
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
- if (bp) {
- if (cur->bc_tp) {
- xfs_trans_ordered_buf(cur->bc_tp, bp);
+ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ return 0;
+ }
+
+ if (cur->bc_tp) {
+ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
- } else {
- xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ return -EAGAIN;
}
} else {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
- ASSERT(level == cur->bc_nlevels - 1);
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 9c95e965cfe5..f2a88c3b1159 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -233,7 +233,8 @@ typedef struct xfs_btree_cur
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
+#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
+#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 5c2929f94bd3..072ebfe1d6ae 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -240,23 +240,19 @@ xfs_defer_trans_abort(
STATIC int
xfs_defer_trans_roll(
struct xfs_trans **tp,
- struct xfs_defer_ops *dop,
- struct xfs_inode *ip)
+ struct xfs_defer_ops *dop)
{
int i;
int error;
- /* Log all the joined inodes except the one we passed in. */
- for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
- if (dop->dop_inodes[i] == ip)
- continue;
+ /* Log all the joined inodes. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE);
- }
trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
/* Roll the transaction. */
- error = xfs_trans_roll(tp, ip);
+ error = xfs_trans_roll(tp);
if (error) {
trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error);
xfs_defer_trans_abort(*tp, dop, error);
@@ -264,12 +260,9 @@ xfs_defer_trans_roll(
}
dop->dop_committed = true;
- /* Rejoin the joined inodes except the one we passed in. */
- for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) {
- if (dop->dop_inodes[i] == ip)
- continue;
+ /* Rejoin the joined inodes. */
+ for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++)
xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0);
- }
return error;
}
@@ -284,11 +277,10 @@ xfs_defer_has_unfinished_work(
/*
* Add this inode to the deferred op. Each joined inode is relogged
- * each time we roll the transaction, in addition to any inode passed
- * to xfs_defer_finish().
+ * each time we roll the transaction.
*/
int
-xfs_defer_join(
+xfs_defer_ijoin(
struct xfs_defer_ops *dop,
struct xfs_inode *ip)
{
@@ -317,8 +309,7 @@ xfs_defer_join(
int
xfs_defer_finish(
struct xfs_trans **tp,
- struct xfs_defer_ops *dop,
- struct xfs_inode *ip)
+ struct xfs_defer_ops *dop)
{
struct xfs_defer_pending *dfp;
struct list_head *li;
@@ -337,7 +328,7 @@ xfs_defer_finish(
xfs_defer_intake_work(*tp, dop);
/* Roll the transaction. */
- error = xfs_defer_trans_roll(tp, dop, ip);
+ error = xfs_defer_trans_roll(tp, dop);
if (error)
goto out;
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index f6e93ef0bffe..d4f046dd44bd 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -72,12 +72,11 @@ struct xfs_defer_ops {
void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type,
struct list_head *h);
-int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop,
- struct xfs_inode *ip);
+int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop);
void xfs_defer_cancel(struct xfs_defer_ops *dop);
void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp);
bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop);
-int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip);
+int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip);
/* Description of a deferred type. */
struct xfs_defer_op_type {
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index abf5beaae907..988bb3f31446 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -378,8 +378,6 @@ xfs_ialloc_inode_init(
* transaction and pin the log appropriately.
*/
xfs_trans_ordered_buf(tp, fbuf);
- xfs_trans_log_buf(tp, fbuf, 0,
- BBTOB(fbuf->b_length) - 1);
}
} else {
fbuf->b_flags |= XBF_DONE;
@@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt(
int error;
int offset;
int i, j;
+ int searchdistance = 10;
pag = xfs_perag_get(mp, agno);
@@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt(
if (pagno == agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
- int searchdistance = 10;
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
@@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt(
/*
* Loop until we find an inode chunk with a free inode.
*/
- while (!doneleft || !doneright) {
+ while (--searchdistance > 0 && (!doneleft || !doneright)) {
int useleft; /* using left inode chunk this time */
- if (!--searchdistance) {
- /*
- * Not in range - save last search
- * location and allocate a new inode
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- pag->pagl_leftrec = trec.ir_startino;
- pag->pagl_rightrec = rec.ir_startino;
- pag->pagl_pagino = pagino;
- goto newino;
- }
-
/* figure out the closer block if both are valid. */
if (!doneleft && !doneright) {
useleft = pagino -
@@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt(
goto error1;
}
- /*
- * We've reached the end of the btree. because
- * we are only searching a small chunk of the
- * btree each search, there is obviously free
- * inodes closer to the parent inode than we
- * are now. restart the search again.
- */
- pag->pagl_pagino = NULLAGINO;
- pag->pagl_leftrec = NULLAGINO;
- pag->pagl_rightrec = NULLAGINO;
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- goto restart_pagno;
+ if (searchdistance <= 0) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+
+ } else {
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
}
/*
* In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
-newino:
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
XFS_LOOKUP_EQ, &i);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0e80f34fe97c..31840ca24018 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
{
- int nlists; /* number of irec's (ex lists) */
- int size; /* current indirection array size */
-
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- size = nlists * sizeof(xfs_ext_irec_t);
ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) && (new_size != size));
+ ASSERT((new_size >= 0) &&
+ (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
+ sizeof(xfs_ext_irec_t))));
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {
@@ -2023,3 +2020,15 @@ xfs_iext_get_extent(
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
return true;
}
+
+void
+xfs_iext_update_extent(
+ struct xfs_ifork *ifp,
+ xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp)
+{
+ ASSERT(idx >= 0);
+ ASSERT(idx < xfs_iext_count(ifp));
+
+ xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp);
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7fb8365326d1..11af705219f6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -187,6 +187,8 @@ bool xfs_iext_lookup_extent(struct xfs_inode *ip,
xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
struct xfs_bmbt_irec *gotp);
+void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp);
extern struct kmem_zone *xfs_ifork_zone;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 45b1c3b4e047..9d5406b4f663 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1679,7 +1679,7 @@ xfs_refcount_recover_cow_leftovers(
xfs_bmap_add_free(mp, &dfops, fsb,
rr->rr_rrec.rc_blockcount, NULL);
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c8ca03a5a08f..fffae1390d7f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -85,11 +85,11 @@ xfs_find_bdev_for_inode(
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- * the page at all, as we may be racing with memory reclaim and it can free both
- * the bufferhead chain and the page as it will see the page as clean and
- * unused.
+ * Note that we open code the action in end_buffer_async_write here so that we
+ * only have to iterate over the buffers attached to the page once. This is not
+ * only more efficient, but also ensures that we only calls end_page_writeback
+ * at the end of the iteration, and thus avoids the pitfall of having the page
+ * and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
@@ -97,29 +97,44 @@ xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
- unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
- struct buffer_head *head, *bh, *next;
+ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
+ bool busy = false;
unsigned int off = 0;
- unsigned int bsize;
+ unsigned long flags;
ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
- ASSERT(end < PAGE_SIZE);
+ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
- bh = head = page_buffers(bvec->bv_page);
-
- bsize = bh->b_size;
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
- if (off > end)
- break;
- next = bh->b_this_page;
- if (off < bvec->bv_offset)
- goto next_bh;
- bh->b_end_io(bh, !error);
-next_bh:
- off += bsize;
- } while ((bh = next) != head);
+ if (off >= bvec->bv_offset &&
+ off < bvec->bv_offset + bvec->bv_len) {
+ ASSERT(buffer_async_write(bh));
+ ASSERT(bh->b_end_io == NULL);
+
+ if (error) {
+ mark_buffer_write_io_error(bh);
+ clear_buffer_uptodate(bh);
+ SetPageError(bvec->bv_page);
+ } else {
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } else if (buffer_async_write(bh)) {
+ ASSERT(buffer_locked(bh));
+ busy = true;
+ }
+ off += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+
+ if (!busy)
+ end_page_writeback(bvec->bv_page);
}
/*
@@ -133,8 +148,10 @@ xfs_destroy_ioend(
int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *last = ioend->io_bio;
- struct bio *bio, *next;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
@@ -155,6 +172,11 @@ xfs_destroy_ioend(
bio_put(bio);
}
+
+ if (unlikely(error && !quiet)) {
+ xfs_err_ratelimited(XFS_I(inode)->i_mount,
+ "writeback error on sector %llu", start);
+ }
}
/*
@@ -423,7 +445,8 @@ xfs_start_buffer_writeback(
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));
- mark_buffer_async_write(bh);
+ bh->b_end_io = NULL;
+ set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index be0b79d8900f..ebd66b19fbfc 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent(
/*
* Roll to next transaction.
*/
- error = xfs_trans_roll(trans, dp);
+ error = xfs_trans_roll_inode(trans, dp);
if (error)
return error;
}
@@ -308,7 +308,7 @@ xfs_attr3_node_inactive(
/*
* Atomically commit the whole invalidate stuff.
*/
- error = xfs_trans_roll(trans, dp);
+ error = xfs_trans_roll_inode(trans, dp);
if (error)
return error;
}
@@ -375,7 +375,7 @@ xfs_attr3_root_inactive(
/*
* Commit the invalidate and start the next transaction.
*/
- error = xfs_trans_roll(trans, dp);
+ error = xfs_trans_roll_inode(trans, dp);
return error;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 88073910fa5d..dd136f7275e4 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -502,7 +502,7 @@ xfs_bui_recover(
}
/* Finish transaction, free inodes. */
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto err_dfops;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 93e955262d07..cd9a5400ba4f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -222,22 +222,21 @@ xfs_bmap_eof(
* Count leaf blocks given a range of extent records. Delayed allocation
* extents are not counted towards the totals.
*/
-STATIC void
+xfs_extnum_t
xfs_bmap_count_leaves(
struct xfs_ifork *ifp,
- xfs_extnum_t *numrecs,
xfs_filblks_t *count)
{
- xfs_extnum_t i;
- xfs_extnum_t nr_exts = xfs_iext_count(ifp);
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t numrecs = 0, i = 0;
- for (i = 0; i < nr_exts; i++) {
- xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i);
- if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) {
- (*numrecs)++;
- *count += xfs_bmbt_get_blockcount(frp);
+ while (xfs_iext_get_extent(ifp, i++, &got)) {
+ if (!isnullstartblock(got.br_startblock)) {
+ *count += got.br_blockcount;
+ numrecs++;
}
}
+ return numrecs;
}
/*
@@ -370,7 +369,7 @@ xfs_bmap_count_blocks(
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
case XFS_DINODE_FMT_EXTENTS:
- xfs_bmap_count_leaves(ifp, nextents, count);
+ *nextents = xfs_bmap_count_leaves(ifp, count);
return 0;
case XFS_DINODE_FMT_BTREE:
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -1136,7 +1135,7 @@ xfs_alloc_file_space(
/*
* Complete the transaction
*/
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto error0;
@@ -1202,7 +1201,8 @@ xfs_unmap_extent(
if (error)
goto out_bmap_cancel;
- error = xfs_defer_finish(&tp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -1496,7 +1496,7 @@ xfs_shift_file_space(
if (error)
goto out_bmap_cancel;
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -1777,7 +1777,8 @@ xfs_swap_extent_rmap(
if (error)
goto out_defer;
- error = xfs_defer_finish(tpp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(tpp, &dfops);
if (error)
goto out_defer;
@@ -1840,29 +1841,18 @@ xfs_swap_extent_forks(
}
/*
- * Before we've swapped the forks, lets set the owners of the forks
- * appropriately. We have to do this as we are demand paging the btree
- * buffers, and so the validation done on read will expect the owner
- * field to be correctly set. Once we change the owners, we can swap the
- * inode forks.
+ * Btree format (v3) inodes have the inode number stamped in the bmbt
+ * block headers. We can't start changing the bmbt blocks until the
+ * inode owner change is logged so recovery does the right thing in the
+ * event of a crash. Set the owner change log flags now and leave the
+ * bmbt scan as the last step.
*/
if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
- tip->i_ino, NULL);
- if (error)
- return error;
- }
-
if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*src_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
- ip->i_ino, NULL);
- if (error)
- return error;
- }
/*
* Swap the data forks of the inodes
@@ -1940,6 +1930,48 @@ xfs_swap_extent_forks(
return 0;
}
+/*
+ * Fix up the owners of the bmbt blocks to refer to the current inode. The
+ * change owner scan attempts to order all modified buffers in the current
+ * transaction. In the event of ordered buffer failure, the offending buffer is
+ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
+ * the transaction in this case to replenish the fallback log reservation and
+ * restart the scan. This process repeats until the scan completes.
+ */
+static int
+xfs_swap_change_owner(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tmpip)
+{
+ int error;
+ struct xfs_trans *tp = *tpp;
+
+ do {
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
+ NULL);
+ /* success or fatal error */
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_trans_roll(tpp);
+ if (error)
+ break;
+ tp = *tpp;
+
+ /*
+ * Redirty both inodes so they can relog and keep the log tail
+ * moving forward.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tmpip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
+ } while (true);
+
+ return error;
+}
+
int
xfs_swap_extents(
struct xfs_inode *ip, /* target inode */
@@ -1954,7 +1986,7 @@ xfs_swap_extents(
int lock_flags;
struct xfs_ifork *cowfp;
uint64_t f;
- int resblks;
+ int resblks = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -2002,11 +2034,8 @@ xfs_swap_extents(
XFS_SWAP_RMAP_SPACE_RES(mp,
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- 0, 0, &tp);
- } else
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
- 0, 0, &tp);
+ }
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out_unlock;
@@ -2092,6 +2121,23 @@ xfs_swap_extents(
xfs_trans_log_inode(tp, tip, target_log_flags);
/*
+ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
+ * have inode number owner values in the bmbt blocks that still refer to
+ * the old inode. Scan each bmbt to fix up the owner values with the
+ * inode number of the current inode.
+ */
+ if (src_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, ip, tip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if (target_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, tip, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 0cede1043571..0eaa81dc49be 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -70,6 +70,7 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count);
int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, xfs_extnum_t *nextents,
xfs_filblks_t *count);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f6a8422e9562..e0a0af0946f2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -29,6 +29,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -322,6 +323,8 @@ xfs_buf_item_format(
ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
(xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
&& xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
+ (bip->bli_flags & XFS_BLI_STALE));
/*
@@ -346,16 +349,6 @@ xfs_buf_item_format(
bip->bli_flags &= ~XFS_BLI_INODE_BUF;
}
- if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
- XFS_BLI_ORDERED) {
- /*
- * The buffer has been logged just to order it. It is not being
- * included in the transaction commit, so don't format it.
- */
- trace_xfs_buf_item_format_ordered(bip);
- return;
- }
-
for (i = 0; i < bip->bli_format_count; i++) {
xfs_buf_item_format_segment(bip, lv, &vecp, offset,
&bip->bli_formats[i]);
@@ -574,26 +567,20 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool clean;
- bool aborted;
- int flags;
+ bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
+ bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
+ bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+#endif
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
/*
- * If this is a transaction abort, don't return early. Instead, allow
- * the brelse to happen. Normally it would be done for stale
- * (cancelled) buffers at unpin time, but we'll never go through the
- * pin/unpin cycle if we abort inside commit.
+ * The per-transaction state has been copied above so clear it from the
+ * bli.
*/
- aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
- /*
- * Before possibly freeing the buf item, copy the per-transaction state
- * so we can reference it safely later after clearing it from the
- * buffer log item.
- */
- flags = bip->bli_flags;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
/*
@@ -601,7 +588,7 @@ xfs_buf_item_unlock(
* unlock the buffer and free the buf item when the buffer is unpinned
* for the last time.
*/
- if (flags & XFS_BLI_STALE) {
+ if (bip->bli_flags & XFS_BLI_STALE) {
trace_xfs_buf_item_unlock_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
if (!aborted) {
@@ -619,20 +606,11 @@ xfs_buf_item_unlock(
* regardless of whether it is dirty or not. A dirty abort implies a
* shutdown, anyway.
*
- * Ordered buffers are dirty but may have no recorded changes, so ensure
- * we only release clean items here.
+ * The bli dirty state should match whether the blf has logged segments
+ * except for ordered buffers, where only the bli should be dirty.
*/
- clean = (flags & XFS_BLI_DIRTY) ? false : true;
- if (clean) {
- int i;
- for (i = 0; i < bip->bli_format_count; i++) {
- if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
- bip->bli_formats[i].blf_map_size)) {
- clean = false;
- break;
- }
- }
- }
+ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
+ (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
/*
* Clean buffers, by definition, cannot be in the AIL. However, aborted
@@ -651,11 +629,11 @@ xfs_buf_item_unlock(
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
- } else if (clean)
+ } else if (!dirty)
xfs_buf_item_relse(bp);
}
- if (!(flags & XFS_BLI_HOLD))
+ if (!hold)
xfs_buf_relse(bp);
}
@@ -945,14 +923,22 @@ xfs_buf_item_log(
/*
- * Return 1 if the buffer has been logged or ordered in a transaction (at any
- * point, not just the current transaction) and 0 if not.
+ * Return true if the buffer has any ranges logged/dirtied by a transaction,
+ * false otherwise.
*/
-uint
-xfs_buf_item_dirty(
- xfs_buf_log_item_t *bip)
+bool
+xfs_buf_item_dirty_format(
+ struct xfs_buf_log_item *bip)
{
- return (bip->bli_flags & XFS_BLI_DIRTY);
+ int i;
+
+ for (i = 0; i < bip->bli_format_count; i++) {
+ if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+ bip->bli_formats[i].blf_map_size))
+ return true;
+ }
+
+ return false;
}
STATIC void
@@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks(
}
}
+/*
+ * Invoke the error state callback for each log item affected by the failed I/O.
+ *
+ * If a metadata buffer write fails with a non-permanent error, the buffer is
+ * eventually resubmitted and so the completion callbacks are not run. The error
+ * state may need to be propagated to the log items attached to the buffer,
+ * however, so the next AIL push of the item knows hot to handle it correctly.
+ */
+STATIC void
+xfs_buf_do_callbacks_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *next;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ spin_lock(&ailp->xa_lock);
+ for (; lip; lip = next) {
+ next = lip->li_bio_list;
+ if (lip->li_ops->iop_error)
+ lip->li_ops->iop_error(lip, bp);
+ }
+ spin_unlock(&ailp->xa_lock);
+}
+
static bool
xfs_buf_iodone_callback_error(
struct xfs_buf *bp)
@@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error(
if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
goto permanent_error;
- /* still a transient error, higher layers will retry */
+ /*
+ * Still a transient error, run IO completion failure callbacks and let
+ * the higher layers retry the buffer.
+ */
+ xfs_buf_do_callbacks_fail(bp);
xfs_buf_ioerror(bp, 0);
xfs_buf_relse(bp);
return true;
@@ -1204,3 +1219,31 @@ xfs_buf_iodone(
xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_item_free(BUF_ITEM(lip));
}
+
+/*
+ * Requeue a failed buffer for writeback
+ *
+ * Return true if the buffer has been re-queued properly, false otherwise
+ */
+bool
+xfs_buf_resubmit_failed_buffers(
+ struct xfs_buf *bp,
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_log_item *next;
+
+ /*
+ * Clear XFS_LI_FAILED flag from all items before resubmit
+ *
+ * XFS_LI_FAILED set/clear is protected by xa_lock, caller this
+ * function already have it acquired
+ */
+ for (; lip; lip = next) {
+ next = lip->li_bio_list;
+ xfs_clear_li_failed(lip);
+ }
+
+ /* Add this buffer back to the delayed write list */
+ return xfs_buf_delwri_queue(bp, buffer_list);
+}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index f7eba99d19dd..9690ce62c9a7 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
-uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
+bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
void(*)(struct xfs_buf *, xfs_log_item_t *),
xfs_log_item_t *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
+bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
+ struct xfs_log_item *,
+ struct list_head *);
extern kmem_zone_t *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index fd2ef8c2c9a7..cd82429d8df7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -383,7 +383,7 @@ xfs_qm_dqalloc(
xfs_trans_bhold(tp, bp);
- error = xfs_defer_finish(tpp, &dfops, NULL);
+ error = xfs_defer_finish(tpp, &dfops);
if (error)
goto error1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2f4feb959bfb..bd786a9ac2c3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_AG_RESV_CRITICAL,
XFS_RANDOM_DROP_WRITES,
XFS_RANDOM_LOG_BAD_CRC,
+ XFS_RANDOM_LOG_ITEM_PIN,
};
struct xfs_errortag_attr {
@@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
+XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+ XFS_ERRORTAG_ATTR_LIST(log_item_pin),
NULL,
};
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 7577be5f09bc..7c4bef3bddb7 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
*/
#define XFS_ERRTAG_DROP_WRITES 28
#define XFS_ERRTAG_LOG_BAD_CRC 29
-#define XFS_ERRTAG_MAX 30
+#define XFS_ERRTAG_LOG_ITEM_PIN 30
+#define XFS_ERRTAG_MAX 31
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_RANDOM_AG_RESV_CRITICAL 4
#define XFS_RANDOM_DROP_WRITES 1
#define XFS_RANDOM_LOG_BAD_CRC 1
+#define XFS_RANDOM_LOG_ITEM_PIN 1
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4893e226fd8..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1011,96 +1011,67 @@ xfs_file_llseek(
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
-
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
-STATIC int
-xfs_filemap_page_mkwrite(
- struct vm_fault *vmf)
+static int
+__xfs_filemap_fault(
+ struct vm_fault *vmf,
+ enum page_entry_size pe_size,
+ bool write_fault)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
int ret;
- trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+ trace_xfs_filemap_fault(ip, pe_size, write_fault);
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (write_fault) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+ }
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
} else {
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
- ret = block_page_mkwrite_return(ret);
+ if (write_fault)
+ ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+ else
+ ret = filemap_fault(vmf);
}
-
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
+ if (write_fault)
+ sb_end_pagefault(inode->i_sb);
return ret;
}
-STATIC int
+static int
xfs_filemap_fault(
struct vm_fault *vmf)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret;
-
- trace_xfs_filemap_fault(XFS_I(inode));
-
/* DAX can shortcut the normal fault path on write faults! */
- if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
- return xfs_filemap_page_mkwrite(vmf);
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode))
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
- else
- ret = filemap_fault(vmf);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
- return ret;
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ IS_DAX(file_inode(vmf->vma->vm_file)) &&
+ (vmf->flags & FAULT_FLAG_WRITE));
}
-/*
- * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
- * both read and write faults. Hence we need to handle both cases. There is no
- * ->huge_mkwrite callout for huge pages, so we have a single function here to
- * handle both cases here. @flags carries the information on the type of fault
- * occuring.
- */
-STATIC int
+static int
xfs_filemap_huge_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size)
{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct xfs_inode *ip = XFS_I(inode);
- int ret;
-
- if (!IS_DAX(inode))
+ if (!IS_DAX(file_inode(vmf->vma->vm_file)))
return VM_FAULT_FALLBACK;
- trace_xfs_filemap_huge_fault(ip);
-
- if (vmf->flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- }
-
- xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
- xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-
- if (vmf->flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(inode->i_sb);
+ /* DAX can shortcut the normal fault path on write faults! */
+ return __xfs_filemap_fault(vmf, pe_size,
+ (vmf->flags & FAULT_FLAG_WRITE));
+}
- return ret;
+static int
+xfs_filemap_page_mkwrite(
+ struct vm_fault *vmf)
+{
+ return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
/*
@@ -1130,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else if (IS_DAX(inode))
- ret = dax_pfn_mkwrite(vmf);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a9e6985a0d0..34227115a5d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1124,11 +1124,11 @@ reclaim:
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
* We do this as early as possible under the ILOCK so that
- * xfs_iflush_cluster() can be guaranteed to detect races with us here.
- * By doing this, we guarantee that once xfs_iflush_cluster has locked
- * XFS_ILOCK that it will see either a valid, flushable inode that will
- * serialise correctly, or it will see a clean (and invalid) inode that
- * it can skip.
+ * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
+ * detect races with us here. By doing this, we guarantee that once
+ * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
+ * it will see either a valid inode that will serialise correctly, or it
+ * will see an invalid inode that it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ff48f0096810..5599dda4727a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1055,7 +1055,7 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- code = xfs_trans_roll(&tp, NULL);
+ code = xfs_trans_roll(&tp);
if (committed != NULL)
*committed = 1;
@@ -1285,7 +1285,7 @@ xfs_create(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -1513,7 +1513,7 @@ xfs_link(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error) {
xfs_defer_cancel(&dfops);
goto error_return;
@@ -1607,11 +1607,12 @@ xfs_itruncate_extents(
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_defer_finish(&tp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
- error = xfs_trans_roll(&tp, ip);
+ error = xfs_trans_roll_inode(&tp, ip);
if (error)
goto out;
}
@@ -1855,7 +1856,7 @@ xfs_inactive_ifree(
* Just ignore errors at this point. There is nothing we can do except
* to try to keep going. Make sure it's not a silent error.
*/
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error) {
xfs_notice(mp, "%s: xfs_defer_finish returned error %d",
__func__, error);
@@ -2359,11 +2360,24 @@ retry:
* already marked stale. If we can't lock it, back off
* and retry.
*/
- if (ip != free_ip &&
- !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+
+ /*
+ * Check the inode number again in case we're
+ * racing with freeing in xfs_reclaim_inode().
+ * See the comments in that function for more
+ * information as to why the initial check is
+ * not sufficient.
+ */
+ if (ip->i_ino != inum + i) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ continue;
+ }
}
rcu_read_unlock();
@@ -2637,7 +2651,7 @@ xfs_remove(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -2723,7 +2737,7 @@ xfs_finish_rename(
if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_defer_finish(&tp, dfops, NULL);
+ error = xfs_defer_finish(&tp, dfops);
if (error) {
xfs_defer_cancel(dfops);
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 013cc78d7daf..6d0f74ec31e8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -27,6 +27,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
+/*
+ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
+ * have been failed during writeback
+ *
+ * This informs the AIL that the inode is already flush locked on the next push,
+ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
+ * dirty data makes it to disk.
+ */
+STATIC void
+xfs_inode_item_error(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
+ xfs_set_li_failed(lip, bp);
+}
+
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -484,13 +502,28 @@ xfs_inode_item_push(
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- struct xfs_buf *bp = NULL;
+ struct xfs_buf *bp = lip->li_buf;
uint rval = XFS_ITEM_SUCCESS;
int error;
if (xfs_ipincount(ip) > 0)
return XFS_ITEM_PINNED;
+ /*
+ * The buffer containing this item failed to be written back
+ * previously. Resubmit the buffer for IO.
+ */
+ if (lip->li_flags & XFS_LI_FAILED) {
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
+ rval = XFS_ITEM_FLUSHING;
+
+ xfs_buf_unlock(bp);
+ return rval;
+ }
+
if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
return XFS_ITEM_LOCKED;
@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_unlock = xfs_inode_item_unlock,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_committing = xfs_inode_item_committing
+ .iop_committing = xfs_inode_item_committing,
+ .iop_error = xfs_inode_item_error
};
@@ -710,7 +744,8 @@ xfs_iflush_done(
* the AIL lock.
*/
iip = INODE_ITEM(blip);
- if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
+ if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
+ lip->li_flags & XFS_LI_FAILED)
need_ail++;
blip = next;
@@ -718,7 +753,8 @@ xfs_iflush_done(
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
- if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
+ if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
+ lip->li_flags & XFS_LI_FAILED)
need_ail++;
/*
@@ -739,6 +775,9 @@ xfs_iflush_done(
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
mlip_changed |= xfs_ail_delete_one(ailp, blip);
+ else {
+ xfs_clear_li_failed(blip);
+ }
}
if (mlip_changed) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 9c0c7a920304..5049e8ab6e30 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr(
return 0;
}
-STATIC void
-xfs_set_diflags(
+STATIC uint16_t
+xfs_flags2diflags(
struct xfs_inode *ip,
unsigned int xflags)
{
- unsigned int di_flags;
- uint64_t di_flags2;
-
/* can't set PREALLOC this way, just preserve it */
- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+ uint16_t di_flags =
+ (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+
if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
if (xflags & FS_XFLAG_APPEND)
@@ -970,19 +969,24 @@ xfs_set_diflags(
if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
- ip->i_d.di_flags = di_flags;
- /* diflags2 only valid for v3 inodes. */
- if (ip->i_d.di_version < 3)
- return;
+ return di_flags;
+}
+
+STATIC uint64_t
+xfs_flags2diflags2(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ uint64_t di_flags2 =
+ (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
- di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
if (xflags & FS_XFLAG_COWEXTSIZE)
di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_flags2 = di_flags2;
+ return di_flags2;
}
STATIC void
@@ -1008,11 +1012,12 @@ xfs_diflags_to_linux(
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+#if 0 /* disabled until the flag switching races are sorted out */
if (xflags & FS_XFLAG_DAX)
inode->i_flags |= S_DAX;
else
inode->i_flags &= ~S_DAX;
-
+#endif
}
static int
@@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ uint64_t di_flags2;
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
@@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags(
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
- xfs_set_diflags(ip, fa->fsx_xflags);
+ /* diflags2 only valid for v3 inodes. */
+ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
+ if (di_flags2 && ip->i_d.di_version < 3)
+ return -EINVAL;
+
+ ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ ip->i_d.di_flags2 = di_flags2;
+
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 813394c62849..79cb5b3d140c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -274,7 +274,7 @@ xfs_iomap_write_direct(
/*
* Complete the transaction
*/
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -520,7 +520,6 @@ xfs_file_iomap_begin_delay(
struct inode *inode,
loff_t offset,
loff_t count,
- unsigned flags,
struct iomap *iomap)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -784,7 +783,7 @@ xfs_iomap_write_allocate(
if (error)
goto trans_cancel;
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto trans_cancel;
@@ -906,7 +905,7 @@ xfs_iomap_write_unwritten(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto error_on_bmapi_transaction;
@@ -984,8 +983,7 @@ xfs_file_iomap_begin(
if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, flags,
- iomap);
+ return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
}
if (need_excl_ilock(ip, flags)) {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 469c9fa4c178..17081c77ef86 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize(
* Caution: The caller of this function is responsible for calling
* setattr_prepare() or otherwise verifying the change is fine.
*/
-int
+STATIC int
xfs_setattr_size(
struct xfs_inode *ip,
struct iattr *iattr)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4ebd0bafc914..c5107c7bc4bf 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -743,10 +743,14 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
int error = 0;
+ bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
+ } else if (readonly) {
+ /* Allow unlinked processing to proceed */
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
}
/*
@@ -757,12 +761,27 @@ xfs_log_mount_finish(
* inodes. Turn it off immediately after recovery finishes
* so that we don't leak the quota inodes if subsequent mount
* activities fail.
+ *
+ * We let all inodes involved in redo item processing end up on
+ * the LRU instead of being evicted immediately so that if we do
+ * something to an unlinked inode, the irele won't cause
+ * premature truncation and freeing of the inode, which results
+ * in log recovery failure. We have to evict the unreferenced
+ * lru inodes after clearing MS_ACTIVE because we don't
+ * otherwise clean up the lru if there's a subsequent failure in
+ * xfs_mountfs, which leads to us leaking the inodes if nothing
+ * else (e.g. quotacheck) references the inodes before the
+ * mount failure occurs.
*/
mp->m_super->s_flags |= MS_ACTIVE;
error = xlog_recover_finish(mp->m_log);
if (!error)
xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~MS_ACTIVE;
+ evict_inodes(mp->m_super);
+
+ if (readonly)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
return error;
}
@@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
int error;
/*
- * Don't write out unmount record on read-only mounts.
+ * Don't write out unmount record on norecovery mounts or ro devices.
* Or, if we are doing a forced umount (typically because of IO errors).
*/
- if (mp->m_flags & XFS_MOUNT_RDONLY)
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
+ xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+ ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
+ }
error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
@@ -3353,8 +3375,6 @@ maybe_sleep:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
- if (log_flushed)
- *log_flushed = 1;
} else {
no_sleep:
@@ -3458,8 +3478,6 @@ try_again:
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
- if (log_flushed)
- *log_flushed = 1;
already_slept = 1;
goto try_again;
}
@@ -3493,9 +3511,6 @@ try_again:
*/
if (iclog->ic_state & XLOG_STATE_IOERROR)
return -EIO;
-
- if (log_flushed)
- *log_flushed = 1;
} else { /* just return */
spin_unlock(&log->l_icloglock);
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9549188f5a36..ee34899396b2 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1029,61 +1029,106 @@ out_error:
}
/*
- * Check the log tail for torn writes. This is required when torn writes are
- * detected at the head and the head had to be walked back to a previous record.
- * The tail of the previous record must now be verified to ensure the torn
- * writes didn't corrupt the previous tail.
+ * Calculate distance from head to tail (i.e., unused space in the log).
+ */
+static inline int
+xlog_tail_distance(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ if (head_blk < tail_blk)
+ return tail_blk - head_blk;
+
+ return tail_blk + (log->l_logBBsize - head_blk);
+}
+
+/*
+ * Verify the log tail. This is particularly important when torn or incomplete
+ * writes have been detected near the front of the log and the head has been
+ * walked back accordingly.
+ *
+ * We also have to handle the case where the tail was pinned and the head
+ * blocked behind the tail right before a crash. If the tail had been pushed
+ * immediately prior to the crash and the subsequent checkpoint was only
+ * partially written, it's possible it overwrote the last referenced tail in the
+ * log with garbage. This is not a coherency problem because the tail must have
+ * been pushed before it can be overwritten, but appears as log corruption to
+ * recovery because we have no way to know the tail was updated if the
+ * subsequent checkpoint didn't write successfully.
*
- * Return an error if CRC verification fails as recovery cannot proceed.
+ * Therefore, CRC check the log from tail to head. If a failure occurs and the
+ * offending record is within max iclog bufs from the head, walk the tail
+ * forward and retry until a valid tail is found or corruption is detected out
+ * of the range of a possible overwrite.
*/
STATIC int
xlog_verify_tail(
struct xlog *log,
xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ xfs_daddr_t *tail_blk,
+ int hsize)
{
struct xlog_rec_header *thead;
struct xfs_buf *bp;
xfs_daddr_t first_bad;
- int count;
int error = 0;
bool wrapped;
- xfs_daddr_t tmp_head;
+ xfs_daddr_t tmp_tail;
+ xfs_daddr_t orig_tail = *tail_blk;
bp = xlog_get_bp(log, 1);
if (!bp)
return -ENOMEM;
/*
- * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
- * a temporary head block that points after the last possible
- * concurrently written record of the tail.
+ * Make sure the tail points to a record (returns positive count on
+ * success).
*/
- count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
- XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
- &wrapped);
- if (count < 0) {
- error = count;
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+ &tmp_tail, &thead, &wrapped);
+ if (error < 0)
goto out;
- }
-
- /*
- * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
- * into the actual log head. tmp_head points to the start of the record
- * so update it to the actual head block.
- */
- if (count < XLOG_MAX_ICLOGS + 1)
- tmp_head = head_blk;
+ if (*tail_blk != tmp_tail)
+ *tail_blk = tmp_tail;
/*
- * We now have a tail and temporary head block that covers at least
- * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
- * records were completely written. Run a CRC verification pass from
- * tail to head and return the result.
+ * Run a CRC check from the tail to the head. We can't just check
+ * MAX_ICLOGS records past the tail because the tail may point to stale
+ * blocks cleared during the search for the head/tail. These blocks are
+ * overwritten with zero-length records and thus record count is not a
+ * reliable indicator of the iclog state before a crash.
*/
- error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
+ while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+ int tail_distance;
+
+ /*
+ * Is corruption within range of the head? If so, retry from
+ * the next record. Otherwise return an error.
+ */
+ tail_distance = xlog_tail_distance(log, head_blk, first_bad);
+ if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
+ break;
+ /* skip to the next record; returns positive count on success */
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
+ &tmp_tail, &thead, &wrapped);
+ if (error < 0)
+ goto out;
+
+ *tail_blk = tmp_tail;
+ first_bad = 0;
+ error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ }
+
+ if (!error && *tail_blk != orig_tail)
+ xfs_warn(log->l_mp,
+ "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
+ orig_tail, *tail_blk);
out:
xlog_put_bp(bp);
return error;
@@ -1143,7 +1188,7 @@ xlog_verify_head(
*/
error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
XLOG_RECOVER_CRCPASS, &first_bad);
- if (error == -EFSBADCRC) {
+ if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
/*
* We've hit a potential torn write. Reset the error and warn
* about it.
@@ -1183,31 +1228,12 @@ xlog_verify_head(
ASSERT(0);
return 0;
}
-
- /*
- * Now verify the tail based on the updated head. This is
- * required because the torn writes trimmed from the head could
- * have been written over the tail of a previous record. Return
- * any errors since recovery cannot proceed if the tail is
- * corrupt.
- *
- * XXX: This leaves a gap in truly robust protection from torn
- * writes in the log. If the head is behind the tail, the tail
- * pushes forward to create some space and then a crash occurs
- * causing the writes into the previous record's tail region to
- * tear, log recovery isn't able to recover.
- *
- * How likely is this to occur? If possible, can we do something
- * more intelligent here? Is it safe to push the tail forward if
- * we can determine that the tail is within the range of the
- * torn write (e.g., the kernel can only overwrite the tail if
- * it has actually been pushed forward)? Alternatively, could we
- * somehow prevent this condition at runtime?
- */
- error = xlog_verify_tail(log, *head_blk, *tail_blk);
}
+ if (error)
+ return error;
- return error;
+ return xlog_verify_tail(log, *head_blk, tail_blk,
+ be32_to_cpu((*rhead)->h_size));
}
/*
@@ -4801,12 +4827,16 @@ xlog_recover_process_intents(
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
+#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
+#endif
ailp = log->l_ailp;
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
+#endif
while (lip != NULL) {
/*
* We're done when we see something other than an intent.
@@ -5218,7 +5248,7 @@ xlog_do_recovery_pass(
xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
- xfs_daddr_t blk_no;
+ xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
@@ -5231,7 +5261,7 @@ xlog_do_recovery_pass(
LIST_HEAD (buffer_list);
ASSERT(head_blk != tail_blk);
- rhead_blk = 0;
+ blk_no = rhead_blk = tail_blk;
for (i = 0; i < XLOG_RHASH_SIZE; i++)
INIT_HLIST_HEAD(&rhash[i]);
@@ -5309,7 +5339,6 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
- blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -5371,9 +5400,19 @@ xlog_do_recovery_pass(
bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
blk_no += hblks;
- /* Read in data for log record */
- if (blk_no + bblks <= log->l_logBBsize) {
- error = xlog_bread(log, blk_no, bblks, dbp,
+ /*
+ * Read the log record data in multiple reads if it
+ * wraps around the end of the log. Note that if the
+ * header already wrapped, blk_no could point past the
+ * end of the log. The record data is contiguous in
+ * that case.
+ */
+ if (blk_no + bblks <= log->l_logBBsize ||
+ blk_no >= log->l_logBBsize) {
+ /* mod blk_no in case the header wrapped and
+ * pushed it beyond the end of the log */
+ rblk_no = do_mod(blk_no, log->l_logBBsize);
+ error = xlog_bread(log, rblk_no, bblks, dbp,
&offset);
if (error)
goto bread_err2;
@@ -5563,6 +5602,8 @@ xlog_do_recover(
xfs_buf_t *bp;
xfs_sb_t *sbp;
+ trace_xfs_log_recover(log, head_blk, tail_blk);
+
/*
* First replay the images in the log.
*/
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 15751dc2a27d..010a13a201aa 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -31,6 +31,7 @@
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_qm.h"
@@ -1120,31 +1121,6 @@ xfs_qm_quotacheck_dqadjust(
return 0;
}
-STATIC int
-xfs_qm_get_rtblks(
- xfs_inode_t *ip,
- xfs_qcnt_t *O_rtblks)
-{
- xfs_filblks_t rtblks; /* total rt blks */
- xfs_extnum_t idx; /* extent record index */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t nextents; /* number of extent entries */
- int error;
-
- ASSERT(XFS_IS_REALTIME_INODE(ip));
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
- return error;
- }
- rtblks = 0;
- nextents = xfs_iext_count(ifp);
- for (idx = 0; idx < nextents; idx++)
- rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
- *O_rtblks = (xfs_qcnt_t)rtblks;
- return 0;
-}
-
/*
* callback routine supplied to bulkstat(). Given an inumber, find its
* dquots and update them to account for resources taken by that inode.
@@ -1160,7 +1136,8 @@ xfs_qm_dqusage_adjust(
int *res) /* result code value */
{
xfs_inode_t *ip;
- xfs_qcnt_t nblks, rtblks = 0;
+ xfs_qcnt_t nblks;
+ xfs_filblks_t rtblks = 0; /* total rt blks */
int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1190,12 +1167,15 @@ xfs_qm_dqusage_adjust(
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
- /*
- * Walk thru the extent list and count the realtime blocks.
- */
- error = xfs_qm_get_rtblks(ip, &rtblks);
- if (error)
- goto error0;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto error0;
+ }
+
+ xfs_bmap_count_leaves(ifp, &rtblks);
}
nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 96fe209b5eb6..8f2e2fac4255 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -525,7 +525,7 @@ xfs_cui_recover(
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto abort_defer;
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f45fbf0db9bb..3246815c24d6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -464,7 +464,7 @@ retry:
goto out_bmap_cancel;
/* Finish up. */
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -602,7 +602,8 @@ xfs_reflink_cancel_cow_blocks(
-(long)del.br_blockcount);
/* Roll the transaction */
- error = xfs_defer_finish(tpp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(tpp, &dfops);
if (error) {
xfs_defer_cancel(&dfops);
break;
@@ -791,7 +792,8 @@ xfs_reflink_end_cow(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
- error = xfs_defer_finish(&tp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
next_extent:
@@ -1152,7 +1154,8 @@ xfs_reflink_remap_extent(
next_extent:
/* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_defer;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 91472193643b..488719d43ca8 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -810,7 +810,7 @@ xfs_growfs_rt_alloc(
/*
* Free any blocks freed up in the transaction, then commit.
*/
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
error = xfs_trans_commit(tp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 38aaacdbb8b3..c1c4c2ea1014 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1220,7 +1220,7 @@ xfs_test_remount_options(
tmp_mp->m_super = sb;
error = xfs_parseargs(tmp_mp, options);
xfs_free_fsname(tmp_mp);
- kfree(tmp_mp);
+ kmem_free(tmp_mp);
return error;
}
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 23a50d7aa46a..68d3ca2c4968 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -378,7 +378,7 @@ xfs_symlink(
xfs_trans_set_sync(tp);
}
- error = xfs_defer_finish(&tp, &dfops, NULL);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto out_bmap_cancel;
@@ -497,7 +497,8 @@ xfs_inactive_symlink_rmt(
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- error = xfs_defer_finish(&tp, &dfops, ip);
+ xfs_defer_ijoin(&dfops, ip);
+ error = xfs_defer_finish(&tp, &dfops);
if (error)
goto error_bmap_cancel;
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index bcc3cdf8e1c5..bb5514688d47 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
@@ -689,11 +688,34 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
-DEFINE_INODE_EVENT(xfs_filemap_fault);
-DEFINE_INODE_EVENT(xfs_filemap_huge_fault);
-DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
+TRACE_EVENT(xfs_filemap_fault,
+ TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
+ bool write_fault),
+ TP_ARGS(ip, pe_size, write_fault),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(enum page_entry_size, pe_size)
+ __field(bool, write_fault)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->pe_size = pe_size;
+ __entry->write_fault = write_fault;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->pe_size,
+ { PE_SIZE_PTE, "PTE" },
+ { PE_SIZE_PMD, "PMD" },
+ { PE_SIZE_PUD, "PUD" }),
+ __entry->write_fault)
+)
+
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
TP_ARGS(ip, caller_ip),
@@ -1963,6 +1985,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+TRACE_EVENT(xfs_log_recover,
+ TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
+ TP_ARGS(log, headblk, tailblk),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_daddr_t, headblk)
+ __field(xfs_daddr_t, tailblk)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->headblk = headblk;
+ __entry->tailblk = tailblk;
+ ),
+ TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
+ __entry->tailblk)
+)
+
TRACE_EVENT(xfs_log_recover_record,
TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
TP_ARGS(log, rhead, pass),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2011620008de..a87f657f59c9 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1035,25 +1035,18 @@ xfs_trans_cancel(
*/
int
xfs_trans_roll(
- struct xfs_trans **tpp,
- struct xfs_inode *dp)
+ struct xfs_trans **tpp)
{
- struct xfs_trans *trans;
+ struct xfs_trans *trans = *tpp;
struct xfs_trans_res tres;
int error;
/*
- * Ensure that the inode is always logged.
- */
- trans = *tpp;
- if (dp)
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-
- /*
* Copy the critical parameters from one trans to the next.
*/
tres.tr_logres = trans->t_log_res;
tres.tr_logcount = trans->t_log_count;
+
*tpp = xfs_trans_dup(trans);
/*
@@ -1067,10 +1060,8 @@ xfs_trans_roll(
if (error)
return error;
- trans = *tpp;
-
/*
- * Reserve space in the log for th next transaction.
+ * Reserve space in the log for the next transaction.
* This also pushes items in the "AIL", the list of logged items,
* out to disk if they are taking up space at the tail of the log
* that we want to use. This requires that either nothing be locked
@@ -1078,14 +1069,5 @@ xfs_trans_roll(
* the prior and the next transactions.
*/
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- error = xfs_trans_reserve(trans, &tres, 0, 0);
- /*
- * Ensure that the inode is in the new transaction and locked.
- */
- if (error)
- return error;
-
- if (dp)
- xfs_trans_ijoin(trans, dp, 0);
- return 0;
+ return xfs_trans_reserve(*tpp, &tres, 0, 0);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 6bdad6f58934..815b53d20e26 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,7 @@ typedef struct xfs_log_item {
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
uint li_flags; /* misc flags */
+ struct xfs_buf *li_buf; /* real buffer pointer */
struct xfs_log_item *li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
struct xfs_log_item *);
@@ -64,11 +65,13 @@ typedef struct xfs_log_item {
} xfs_log_item_t;
#define XFS_LI_IN_AIL 0x1
-#define XFS_LI_ABORTED 0x2
+#define XFS_LI_ABORTED 0x2
+#define XFS_LI_FAILED 0x4
#define XFS_LI_FLAGS \
{ XFS_LI_IN_AIL, "IN_AIL" }, \
- { XFS_LI_ABORTED, "ABORTED" }
+ { XFS_LI_ABORTED, "ABORTED" }, \
+ { XFS_LI_FAILED, "FAILED" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -79,6 +82,7 @@ struct xfs_item_ops {
void (*iop_unlock)(xfs_log_item_t *);
xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+ void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
};
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
@@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
-void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
-void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
+ uint);
+void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);
@@ -224,7 +230,8 @@ int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
xfs_extlen_t, struct xfs_owner_info *);
int xfs_trans_commit(struct xfs_trans *);
-int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+int xfs_trans_roll(struct xfs_trans **);
+int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 9056c0f34a3c..354368a906e5 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -325,6 +325,21 @@ xfs_ail_delete(
xfs_trans_ail_cursor_clear(ailp, lip);
}
+static inline uint
+xfsaild_push_item(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ /*
+ * If log item pinning is enabled, skip the push and track the item as
+ * pinned. This can help induce head-behind-tail conditions.
+ */
+ if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ return XFS_ITEM_PINNED;
+
+ return lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+}
+
static long
xfsaild_push(
struct xfs_ail *ailp)
@@ -382,7 +397,7 @@ xfsaild_push(
* rely on the AIL cursor implementation to be able to deal with
* the dropped lock.
*/
- lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
+ lock_result = xfsaild_push_item(ailp, lip);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
XFS_STATS_INC(mp, xs_push_ail_success);
@@ -687,12 +702,13 @@ xfs_trans_ail_update_bulk(
bool
xfs_ail_delete_one(
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
+ xfs_clear_li_failed(lip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 86987d823d76..3ba7a96a8abd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
- } else if (!xfs_buf_item_dirty(bip)) {
+ } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
/***
ASSERT(bp->b_pincount == 0);
***/
@@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
}
/*
- * This is called to mark bytes first through last inclusive of the given
- * buffer as needing to be logged when the transaction is committed.
- * The buffer must already be associated with the given transaction.
- *
- * First and last are numbers relative to the beginning of this buffer,
- * so the first byte in the buffer is numbered 0 regardless of the
- * value of b_blkno.
+ * Mark a buffer dirty in the transaction.
*/
void
-xfs_trans_log_buf(xfs_trans_t *tp,
- xfs_buf_t *bp,
- uint first,
- uint last)
+xfs_trans_dirty_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- xfs_buf_log_item_t *bip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
- ASSERT(first <= last && last < BBTOB(bp->b_length));
ASSERT(bp->b_iodone == NULL ||
bp->b_iodone == xfs_buf_iodone_callbacks);
@@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_iodone = xfs_buf_iodone_callbacks;
bip->bli_item.li_cb = xfs_buf_iodone;
- trace_xfs_trans_log_buf(bip);
-
/*
* If we invalidated the buffer within this transaction, then
* cancel the invalidation now that we're dirtying the buffer
@@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp,
bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
+ bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
- /*
- * If we have an ordered buffer we are not logging any dirty range but
- * it still needs to be marked dirty and that it has been logged.
- */
- bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
- if (!(bip->bli_flags & XFS_BLI_ORDERED))
- xfs_buf_item_log(bip, first, last);
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint first,
+ uint last)
+{
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ ASSERT(first <= last && last < BBTOB(bp->b_length));
+ ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+
+ xfs_trans_dirty_buf(tp, bp);
+
+ trace_xfs_trans_log_buf(bip);
+ xfs_buf_item_log(bip, first, last);
}
@@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf(
}
/*
- * Mark the buffer as ordered for this transaction. This means
- * that the contents of the buffer are not recorded in the transaction
- * but it is tracked in the AIL as though it was. This allows us
- * to record logical changes in transactions rather than the physical
- * changes we make to the buffer without changing writeback ordering
- * constraints of metadata buffers.
+ * Mark the buffer as ordered for this transaction. This means that the contents
+ * of the buffer are not recorded in the transaction but it is tracked in the
+ * AIL as though it was. This allows us to record logical changes in
+ * transactions rather than the physical changes we make to the buffer without
+ * changing writeback ordering constraints of metadata buffers.
*/
-void
+bool
xfs_trans_ordered_buf(
struct xfs_trans *tp,
struct xfs_buf *bp)
@@ -726,8 +735,18 @@ xfs_trans_ordered_buf(
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ if (xfs_buf_item_dirty_format(bip))
+ return false;
+
bip->bli_flags |= XFS_BLI_ORDERED;
trace_xfs_buf_item_ordered(bip);
+
+ /*
+ * We don't log a dirty range of an ordered buffer but it still needs
+ * to be marked dirty and that it has been logged.
+ */
+ xfs_trans_dirty_buf(tp, bp);
+ return true;
}
/*
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index dab8daa676f9..daa7615497f9 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -134,3 +134,17 @@ xfs_trans_log_inode(
flags |= ip->i_itemp->ili_last_fields;
ip->i_itemp->ili_fields |= flags;
}
+
+int
+xfs_trans_roll_inode(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+ error = xfs_trans_roll(tpp);
+ if (!error)
+ xfs_trans_ijoin(*tpp, ip, 0);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index d91706c56c63..b317a3644c00 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
*dst = *src;
}
#endif
+
+static inline void
+xfs_clear_li_failed(
+ struct xfs_log_item *lip)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+ if (lip->li_flags & XFS_LI_FAILED) {
+ lip->li_flags &= ~XFS_LI_FAILED;
+ lip->li_buf = NULL;
+ xfs_buf_rele(bp);
+ }
+}
+
+static inline void
+xfs_set_li_failed(
+ struct xfs_log_item *lip,
+ struct xfs_buf *bp)
+{
+ lockdep_assert_held(&lip->li_ailp->xa_lock);
+
+ if (!(lip->li_flags & XFS_LI_FAILED)) {
+ xfs_buf_hold(bp);
+ lip->li_flags |= XFS_LI_FAILED;
+ lip->li_buf = bp;
+ }
+}
+
#endif /* __XFS_TRANS_PRIV_H__ */